使用selenium用ISBN在京东上批量爬取书籍信息

 

首先读取 .xls 文件,然后根据表格里的ISBN在京东上挨个搜索,再把需要的信息从网页上提取出来保存在另一个文件里。

每次运行 .py 文件后打开浏览器会弹出登录页面(30s),在此期间手动登录,30秒后开始爬取。

#!/usr/bin/python
# -*- coding: UTF-8 -*-

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import json
from selenium.webdriver.common.keys import Keys
from lxml import etree
import xlrd
import xlwt
import datetime
from time import sleep


# Optional headless-browser configuration (uncomment to run without a UI):
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-gpu')
# options.add_argument('--disable-dev-shm-usage')
# driver = webdriver.Chrome(chrome_options=options)

# Result record for one book; singleData() fills these fields in place.
# The placeholder values mark entries that were not found on the page.
data_dict = tDict = {"ISBN": "0000000000000", "出版时间": "0000-00-00", "版次": "1"}

# Shared Chrome session used by test01_login() and singleData();
# quit via driver.quit() at the end of the script.
driver = webdriver.Chrome()

def test01_login():
    """Open the JD login page, wait 30 s for a manual login, then save cookies.

    Uses the module-level ``driver`` (the original created a second, local
    Chrome instance, so the session that later scrapes pages never received
    the login, and that extra browser was never closed) and dumps the
    authenticated cookies to ``cookies.txt`` for ``singleData`` to replay.
    """
    driver.get(
        "https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Fwww.jd.com%2F")

    sleep(30)  # Log in manually in the opened browser during this pause.

    cookies = driver.get_cookies()
    # Persist the session cookies so later requests are authenticated.
    with open("cookies.txt", "w") as f:
        json.dump(cookies, f)

def singleData(para):
    """Search JD for one ISBN and scrape ISBN / publish date / edition.

    para: search keyword (an ISBN value read from the spreadsheet).

    Returns the module-level ``data_dict`` updated in place, or ``None``
    when any step fails (the exception text is appended to exception.txt).
    """
    try:
        driver.get("https://www.jd.com/")

        # Replay the cookies saved by test01_login() so we stay logged in.
        with open("cookies.txt", "r") as f:
            cookies = json.load(f)
            for cookie in cookies:
                driver.add_cookie(cookie)

        driver.find_element_by_id("key").send_keys(para)

        driver.find_element_by_xpath('//*[@id="search"]/div/div[2]/button/i').click()

        sleep(3)  # Give the result page time to render.
        html = driver.page_source
        h = etree.HTML(html)

        # Follow only the first product link in the search results.
        driver.get("https:" + h.xpath('//*[@id="J_goodsList"]/ul/li[1]/div/div[1]/a/@href')[0])

        html = driver.page_source
        h = etree.HTML(html)

        # Parameter-list items on the product page, e.g. "ISBN:9787...".
        # (Renamed from ``list``, which shadowed the builtin.)
        params = h.xpath('//div/ul[@class="parameter2 p-parameter-list"]/li/text()')

        for item in params:
            text = item.lstrip()  # Strip leading spaces before prefix checks.
            if text.startswith("ISBN"):
                data_dict["ISBN"] = item
            if text.startswith("出版时间"):
                data_dict["出版时间"] = item
            if text.startswith("版次"):
                data_dict["版次"] = item

        return data_dict

    except Exception as e:
        # Best effort: log the error and fall through (returns None) so the
        # caller can skip this row and continue the batch.
        with open("exception.txt", "a", encoding="utf-8") as f:
            f.write(str(e) + "\n")


# ---- Driver script: read ISBNs from table.xls, scrape JD, write answer.xls ----

readbook = xlrd.open_workbook("table.xls")
SheetOfInput = readbook.sheet_by_name("Sheet1")
nrows = SheetOfInput.nrows  # Number of rows to process.

writebook = xlwt.Workbook(encoding="utf8")  # Output workbook.
SheetOfOutput = writebook.add_sheet("test")  # Single output sheet.

test01_login()


for gi in range(0, nrows):
    try:
        lng = SheetOfInput.cell(gi, 4).value  # ISBN in column index 4 of row gi.
        tDict = singleData(lng)

        SheetOfOutput.write(gi, 0, tDict["ISBN"])
        SheetOfOutput.write(gi, 1, tDict["出版时间"])
        SheetOfOutput.write(gi, 2, tDict["版次"])
        # Save after every row so progress survives a mid-run crash.
        writebook.save("answer.xls")
        print('tDict["ISBN"] = %s, tDict["出版时间"] = %s, tDict["版次"] = %s, gi = %d.'
              % (tDict["ISBN"], tDict["出版时间"], tDict["版次"], gi))
    except Exception as e:
        # Log and continue with the next row instead of aborting the batch.
        with open("exception.txt", "a", encoding="utf-8") as f:
            f.write(str(e) + "\n")

driver.quit()



#######################################
# 定义一个爬虫函数,针对单条isbn进行爬取,返回一个字典
# 打开table,读取isbn号,
# 调用定义的函数,然后将返回的字典写入table

 

使用selenium用ISBN在京东上批量爬取书籍信息

上一篇:pyhton基础知识——if语句


下一篇:vue项目中如何使用有向无环图(dag-diagram)