Scraping hao6v movies with Scrapy

spider

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import Hao6VItem, Hao6VItem1

import re

class A6vSpider(CrawlSpider):
    name = '6v'
    allowed_domains = ['www.hao6v.tv', 'www.6vw.cc', 'www.hao6v.cc',
                       'www.6vhao.tv', 'www.6vhao.com', 'www.6vhao.net',
                       'www.dy131.net', 'www.6vgood.com', 'www.hao6v.net']
    start_urls = ['https://www.hao6v.tv']

    # Detail pages live under date-shaped paths such as /2024-01-01/....
    # Note: both rules use the same extractor, and CrawlSpider de-duplicates
    # extracted links per response, so only the first rule's callback ever
    # fires as written.
    rules = (
        Rule(LinkExtractor(allow=r'/\d{4}-\d{2}-\d{2}/'), callback='parse_itemA', follow=True),
        Rule(LinkExtractor(allow=r'/\d{4}-\d{2}-\d{2}/'), callback='parse_itemB', follow=True),
    )


    def parse_itemA(self, response):
        # The movie title sits between 《…》 in the page <title>.
        电影名 = response.xpath("//title/text()").re('《(.*)》')
        if len(电影名) < 1:
            return
        电影名 = 电影名[0]

        # The description meta tag carries the movie's info sheet.
        内容 = response.xpath("//meta[@name='description']/@content").extract()
        if len(内容) < 1:
            return
        内容 = 内容[0]

        # Stringifying the split list makes whitespace escapes (\u3000, \r\n)
        # appear literally in the repr, so they can be stripped as plain text.
        内容 = str(内容.split("<br />"))
        for junk in (r"\u3000", " ", r"\r\n", "&middot", "】", "]:", ":"):
            内容 = 内容.replace(junk, "")

        # Drop the surrounding "['" and "']" left over from the list repr.
        t = 内容[2:-2]

        # Pull each labelled field out of the flattened text; a missing
        # label simply yields None instead of leaving res undefined.
        patterns = ["译名(.*?)'", "片名(.*?)'", "年代(.*?)'", "产地(.*?)'",
                    "类别(.*?)'", "片长(.*?)'", "简介(.*?)'"]
        res = []
        for p in patterns:
            m = re.search(p, t)
            res.append(m.group(1) if m else None)

        yield Hao6VItem(电影名=电影名, 译名=res[0], 片名=res[1], 年代=res[2],
                        产地=res[3], 类别=res[4], 片长=res[5], 简介=res[6])
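
The second rule points at parse_itemB, which is missing from this excerpt; judging by Hao6VItem1 below, it is meant to collect magnet links. Scrapy resolves rule callbacks when the spider is instantiated, so the missing method would crash the crawl at startup. A minimal sketch, assuming magnet URLs appear as plain <a href="magnet:..."> anchors on the detail page (the XPath is a guess about the page layout; Hao6VItem1 is imported alongside Hao6VItem above):

    def parse_itemB(self, response):
        电影名 = response.xpath("//title/text()").re('《(.*)》')
        if len(电影名) < 1:
            return
        # Hypothetical selector: every anchor whose href is a magnet URI.
        for link in response.xpath("//a[starts-with(@href, 'magnet:')]/@href").extract():
            yield Hao6VItem1(电影名=电影名[0], 磁力下载=link)

Because of the de-duplication noted in the rules, this callback only receives links if the two rules get distinct allow patterns; merging both extractions into a single callback is the simpler fix.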

items

import scrapy


class Hao6VItem(scrapy.Item):
    # One record per movie detail page.
    电影名 = scrapy.Field()
    译名 = scrapy.Field()
    片名 = scrapy.Field()
    年代 = scrapy.Field()
    产地 = scrapy.Field()
    类别 = scrapy.Field()
    片长 = scrapy.Field()
    简介 = scrapy.Field()


class Hao6VItem1(scrapy.Item):
    # One record per magnet link found on a detail page.
    电影名 = scrapy.Field()
    磁力下载 = scrapy.Field()
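
scrapy.Item instances behave like dicts, which is what lets the pipelines below hand them straight to pandas. For instance (the field values here are made up):

item = Hao6VItem(电影名='示例', 年代='2020')
print(item['电影名'])    # dict-style field access
print(dict(item))        # plain dict, ready for pd.DataFrame([...])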

pipelines

import pandas as pd
import sqlite3

from .items import Hao6VItem
from .items import Hao6VItem1

# One shared SQLite connection; Scrapy pipelines run in a single thread,
# so reusing a module-level connection is fine here.
dbcon = sqlite3.connect("6v.db")

class Hao6VPipeline:
    # Buffers movie-summary items and writes them to SQLite in batches.
    def open_spider(self, spider):
        self.Link = []

    def close_spider(self, spider):
        # Flush whatever is still buffered when the spider stops.
        if self.Link:
            df = pd.DataFrame([dict(i) for i in self.Link])
            df.to_sql('电影概要', dbcon, if_exists='append', index=False)

    def process_item(self, item, spider):
        if not isinstance(item, Hao6VItem):
            return item
        self.Link.append(item)
        if len(self.Link) > 50:
            df = pd.DataFrame([dict(i) for i in self.Link])
            df.to_sql('电影概要', dbcon, if_exists='append', index=False)
            self.Link = []
        return item  # Scrapy expects process_item to return the item.


class Hao6VPipeline1:
    # Same batching scheme, but for magnet-link items.
    def open_spider(self, spider):
        self.Link = []

    def close_spider(self, spider):
        if self.Link:
            df = pd.DataFrame([dict(i) for i in self.Link])
            df.to_sql('磁力链接', dbcon, if_exists='append', index=False)

    def process_item(self, item, spider):
        if not isinstance(item, Hao6VItem1):
            return item
        self.Link.append(item)
        if len(self.Link) > 100:
            df = pd.DataFrame([dict(i) for i in self.Link])
            df.to_sql('磁力链接', dbcon, if_exists='append', index=False)
            self.Link = []
        return item
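
Neither pipeline runs until it is registered in settings.py. A minimal sketch, assuming the project package is named hao6v (adjust the dotted paths to the real package name):

# settings.py
ITEM_PIPELINES = {
    'hao6v.pipelines.Hao6VPipeline': 300,
    'hao6v.pipelines.Hao6VPipeline1': 301,
}

Lower numbers run first; since each pipeline passes through items it does not handle, the relative order hardly matters here.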

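Once a crawl has finished, the tables can be read back with the same pandas/sqlite3 pairing, for example:

import pandas as pd
import sqlite3

con = sqlite3.connect("6v.db")
df = pd.read_sql('SELECT * FROM "电影概要"', con)
print(df.head())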