Nodejs爬虫实战(三)

1. 抽取函数处理

  1. 引入模块
  2. URL有http和https两种协议,协议不同,需要引入的模块(http模块或https模块)也不同
  1. url对象的parse方法能获得http或者https协议的信息。以http://example.com:8080/one?a=index&t=article&m=default为例打印。

     {
         protocol : 'http:' ,
         auth : null ,
         host : 'example.com:8080' ,
         port : '8080' ,
         hostname : 'example.com' ,
         hash : null ,
         search : '?a=index&t=article&m=default',
         query : 'a=index&t=article&m=default',
         pathname : '/one',
         path : '/one?a=index&t=article&m=default',
         href : 'http://example.com:8080/one?a=index&t=article&m=default'
     }
    
  2. protocol属性保存了协议

     // Pick the module that matches the URL scheme; anything other than
     // plain 'http:' falls through to the https module.
     if(urlObj.protocol === 'http:'){
         http = require('http');
     }
     else{
         http = require('https');
     }
    
  3. 处理error页面

     // 'error' is emitted for failures while making the request (DNS
     // resolution, TCP-level or HTTP parse errors), not for an HTTP 404
     // response, so report the real cause instead of a fixed '404'.
     req.on('error',err=>{
         console.log('request failed:',err.message);
     })
    
###### 完整代码

    const fs = require('fs');
    const url = require('url')
    // Download the product page and save the raw response bytes to disk.
    GetUrl('https://detail.tmall.com/item.htm?spm=a230r.1.14.6.68624507tWuF7E&id=560257961625&cm_id=140105335569ed55e27b&abbucket=18&sku_properties=10004:709990523',data=>{
        // fs.writeFile requires a completion callback: omitting it throws a
        // TypeError on Node.js >= 10, and without it write failures go unseen.
        fs.writeFile('iponex.html',data,err=>{
            if(err){
                console.log('write failed:',err.message);
            }
        });
    })
    /**
     * Request a URL over HTTP or HTTPS and pass the complete response body
     * to a callback.
     *
     * The transport module is chosen from the URL's protocol: 'http:' uses
     * the http module, anything else uses https. Request and response
     * stream errors are logged to the console; `success` is not called in
     * that case.
     *
     * @param {string} sUrl - Absolute URL to fetch.
     * @param {function(Buffer): void} [success] - Invoked once, after the
     *     response ends, with all received chunks concatenated.
     */
    function GetUrl(sUrl,success){
        const urlObj = url.parse(sUrl);
        const http = urlObj.protocol === 'http:' ? require('http') : require('https');

        const options = {
            'hostname':urlObj.hostname,
            'path':urlObj.path
        };
        // Forward an explicit port (e.g. example.com:8080); when the URL has
        // none, leave it unset so the module's default port is used.
        if(urlObj.port){
            options.port = urlObj.port;
        }

        const req = http.request(options,res=>{
            // The HTTP status (200, 404, ...) is reported on the response,
            // not through the request's 'error' event.
            console.log(res.statusCode);

            const chunks = [];
            res.on('data',buffer=>{
                chunks.push(buffer);
            });
            res.on('end',()=>{
                const body = Buffer.concat(chunks);
                if(success){
                    success(body);
                }
            });
            res.on('error',err=>{
                console.log('response failed:',err.message);
            });
        });

        // Attach the handler before sending so no failure can go unhandled.
        // 'error' covers DNS, TCP-level and HTTP parse errors.
        req.on('error',err=>{
            console.log('request failed:',err.message);
        });
        req.end();
    }

文章目录