node.js爬虫入门导出json文件并导入数据库（二）

2023-01-31 11:51:21

距离上个月入门已经过去半个多月了，如今数据库已经配置好，接下来要做的就是添加数据和配置服务器。

实际项目中还是会遇到坑：比如今天遇到的乱码问题，偏老的网站使用的是 GBK/GB2312 编码；又比如想从线上图片地址中截取图片名，放到自己的项目路径中；还有些网站有反爬机制。

//导入依赖包
const fs = require("fs");

const superagent = require("superagent");
const cheerio = require("cheerio");
const mongoose = require('mongoose');
var charset = require("superagent-charset")
charset(superagent);
const url = 'mongodb://xxxx:xxxx@xxxxx/mywebsite'

// Open the MongoDB connection. The callback logs the outcome and rethrows the
// connection error so the script does not continue without a database.
mongoose.connect(url, { useNewUrlParser: true }, function (connectError) {
    if (!connectError) {
        console.log('数据库连接成功');
        return;
    }
    console.log('数据库连接失败');
    throw connectError;
});


// Schema for one scraped book record. Every field is stored as a trimmed string.
const bookSchema = new mongoose.Schema({
    bookName: {
        type: String,
        // Fixed typo: this option was spelled `uniqe`, which Mongoose does not
        // recognise, so no uniqueness was ever requested. Note that `unique`
        // asks Mongoose to build a unique index (it is not a validator), so
        // saving a duplicate name rejects with a duplicate-key error.
        unique: true,
        trim: true, // strip leading/trailing whitespace
    },
    bookImg: {
        type: String,
        trim: true, // strip leading/trailing whitespace
    },
    bookPrice: {
        type: String,
        trim: true, // strip leading/trailing whitespace
    },
});

// Model named 'Book', explicitly bound to the `book` collection.
var bookModel = mongoose.model('Book', bookSchema, 'book');

// Scrape the Dangdang category listing, save every book through `bookModel`,
// and dump the collected records to boss1.json.
superagent
    .get("http://category.dangdang.com/cp01.54.06.00.00.00.html") // replace with your own target URL
    .charset('gbk') // decode the response as GBK so Chinese text is not garbled
    .end((error, response) => {
        // A failed request may leave `response` undefined; stop instead of
        // crashing on `response.text`.
        if (error || !response) {
            console.error('页面请求失败', error);
            return;
        }

        // cheerio gives a jQuery-like API over the downloaded HTML.
        const $ = cheerio.load(response.text);
        const result = [];

        // Each <li> under #component_59 holds one book.
        $("#component_59>li").each((index, value) => {
            const $item = $(value);
            const $img = $item.find("img");

            // Lazy-loaded images keep the real URL in `data-original` while
            // `src` only points at a placeholder, so prefer `data-original`.
            // Fall back to '' so an <li> without an image does not throw.
            const imgPath = $img.attr('data-original') || $img.attr('src') || '';

            // Keep only the part after the last "/". When there is no "/",
            // lastIndexOf returns -1 and the whole string is kept. This also
            // handles protocol-relative URLs ("//host/a.jpg"), which the old
            // `indexOf("/") > 0` check let through unchanged.
            const filename = imgPath.substring(imgPath.lastIndexOf("/") + 1);

            const item = {
                bookName: $item.find(".name a").text(),
                bookImg: filename,
                bookPrice: $item.find(".price .search_now_price").text(),
            };

            // Persist the record; report failures instead of leaving the
            // promise floating (an unhandled rejection can kill the process).
            new bookModel(item).save().catch((saveError) => {
                console.error('保存数据失败', item.bookName, saveError);
            });

            result.push(item);
        });

        // Serialise all records and write them to disk.
        fs.writeFile("boss1.json", JSON.stringify(result), "utf-8", (writeError) => {
            if (writeError) {
                console.error('写入 boss1.json 失败', writeError);
                return;
            }
            console.log("恭喜您，数据爬取成功!)");
        });
    });
// connection.end(); 如果不注销的会报错 2013

方法是对的，但后面的图片名都变成了 url_none.png。检查后发现网页上确实有这张占位图：原因是图片使用了懒加载，真实地址放在 data-original 属性里，优先读取该属性（见上面代码中注释掉的那一行）就好了。

下载图片
新建 Node 项目，并在同一级目录下新建 img 文件夹。

const request = require('request')
const fs = require('fs')
const cheerio = require('cheerio')
// Download every image referenced by the page into ./img (the folder must
// already exist next to this script).
request('http://book.dangdang.com/01.54.htm?ref=book-01-A', function (error, response, body) {
    console.log('error:', error); // error-first callback
    console.log('statusCode:', response && response.statusCode); // Print the response status code if a response was received

    // Nothing to parse when the request failed or returned no body.
    if (error || !body) {
        return;
    }

    // Parse the fetched page.
    const $ = cheerio.load(body);
    const imgs = [];

    // Anchored so that "http" appearing in the middle of a path does not count
    // as a scheme.
    const hasScheme = /^(https?|ftp):/i;

    $('img').each((i, e) => {
        let src = $(e).attr('src');
        // <img> tags without a src attribute would otherwise crash on .replace().
        if (!src) {
            return;
        }
        if (!hasScheme.test(src)) {
            // Protocol-relative URL ("//host/path"): give it an explicit scheme.
            src = src.replace(/^\/\//, 'https://');
        }
        imgs.push(src);
    });

    // Download the collected images.
    for (const src of imgs) {
        // Only absolute http(s) URLs are downloaded; relative paths are skipped.
        if (!/^https?:\/\//i.test(src)) {
            continue;
        }

        // Drop any query string / fragment, then keep the part after the last "/".
        const cleanPath = src.split(/[?#]/)[0];
        const filename = cleanPath.substring(cleanPath.lastIndexOf('/') + 1);
        // A URL ending in "/" has no file name; writing to "./img/" would fail.
        if (!filename) {
            continue;
        }

        // Handle errors on both streams so that one bad image does not crash
        // the whole run with an unhandled 'error' event.
        request(src)
            .on('error', (downloadError) => {
                console.error('图片下载失败', src, downloadError);
            })
            .pipe(fs.createWriteStream(`./img/${filename}`))
            .on('error', (writeError) => {
                console.error('图片写入失败', filename, writeError);
            });
    }
});

那现在只要把图片部署到远程服务器上，再处理好图片路径就行了。

码农公寓

相关文章