Nodejs爬虫实战(五)

1. 抓取标签内容

  1. 引入模块

    引入新模块 jsdom 中的 JSDOM 类:`const JSDOM = require('jsdom').JSDOM;`

  2. 创建对象

     let DOM = new JSDOM(html);
     let document = DOM.window.document;
    
  3. DOM操作

     document.querySelector('.tm-count').innerHTML
    
###### 完整代码


    // Counter incremented once per GetUrl call; it is printed in the redirect log message.
    var index = 0;
    // NOTE(review): fs is required but not used anywhere in this listing.
    const fs = require('fs');
    // Legacy url module, used by GetUrl to split the target URL into protocol/hostname/path.
    const url = require('url');
    // Third-party gbk package, used below to turn the raw response Buffer into a string.
    const gbk = require('gbk');
    // JSDOM builds a DOM from an HTML string so the page can be queried with selectors.
    const JSDOM = require('jsdom').JSDOM;
    
    // Download the product page, build a DOM from it and print the inner HTML
    // of the first element matching `.tm-count`.
    GetUrl(
        'https://detail.tmall.com/item.htm?id=548466958386&ali_refid=a3_430583_1006:1103419234:N:%E5%8D%8E%E4%B8%BA:bb84ee4c8f67c7b202d725187b7ad429&ali_trackid=1_bb84ee4c8f67c7b202d725187b7ad429&spm=a230r.1.14.1&sku_properties=5919063:6536025;12304035:116177',
        function onPageLoaded(rawBody){
            // rawBody is the full response Buffer; gbk.toString converts it to a string.
            const pageSource = gbk.toString('utf-8', rawBody);
            // Parse the markup and take the document out of the JSDOM window.
            const { window: pageWindow } = new JSDOM(pageSource);
            const countNode = pageWindow.document.querySelector('.tm-count');
            console.log(countNode.innerHTML);
        }
    );
    /**
     * Fetch a URL with a GET request, follow HTTP redirects, and pass the
     * complete response body to a callback.
     *
     * Increments the module-level `index` counter on every call, including the
     * recursive calls made while following redirects.
     *
     * @param {string} sUrl - Absolute URL to fetch. `http:` uses the http module;
     *   every other protocol is sent through the https module.
     * @param {function(Buffer): void} [success] - Called with the whole body as a
     *   single Buffer once a 200 response has been fully received.
     */
    function GetUrl(sUrl,success){
        index++;
        const urlObj = url.parse(sUrl);
        // Choose the client module that matches the URL scheme.
        const http = urlObj.protocol === 'http:' ? require('http') : require('https');
        // Status codes that carry a Location header to follow.
        const redirectCodes = [301, 302, 303, 307, 308];
        const req = http.request({
            'hostname':urlObj.hostname,
            // Forward an explicit port (e.g. http://host:8080/); when it is null
            // the client falls back to its default port.
            'port':urlObj.port,
            'path':urlObj.path
        },res=>{
            const status = res.statusCode;
            if(status === 200){
                // Collect raw chunks and join them once, so multi-byte
                // characters split across chunks are not corrupted.
                const chunks = [];
                res.on('data',buffer=>{
                    chunks.push(buffer);
                });
                res.on('end',()=>{
                    success && success(Buffer.concat(chunks));
                });
                return;
            }
            // The body is not needed on any other path; drain it so the
            // underlying socket is released.
            res.resume();
            if(redirectCodes.indexOf(status) !== -1){
                const location = res.headers.location;
                if(!location){
                    console.error(`redirect (${status}) without Location header: ${sUrl}`);
                    return;
                }
                console.log(`第${index}次重定向`,location);
                // Location may be relative; resolve it against the current URL.
                GetUrl(url.resolve(sUrl, location),success);
                return;
            }
            console.error(`unexpected status ${status} for ${sUrl}`);
        });
        // Report the real failure (DNS, refused connection, TLS, ...) instead of
        // a misleading fixed '404'.
        req.on('error',err=>{
            console.error(`request error for ${sUrl}: ${err.message}`);
        });
        req.end();
    }

文章目录