Scraping the ranking lists from Maoyan and Zongheng Chinese Net

Maoyan Movies Top 100:

import re
import requests
import json

def getpage(url):
	headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57',
	# The Cookie must be a single header line; adjacent string literals are concatenated
	'Cookie': ('__mta=150214537.1607495152037.1607513645218.1607513649349.10; uuid_n_v=v1; '
		'uuid=606FF27039E711EB81C837695837D31FA9C7894AC0F94110AA7CA5C3F1FA097F; '
		'_lxsdk_cuid=176462d6abdc8-0c3f995e3e7e64-5a301348-144000-176462d6abdc8; '
		'_lxsdk=606FF27039E711EB81C837695837D31FA9C7894AC0F94110AA7CA5C3F1FA097F; '
		'_csrf=ea8f3bd61b09e5dc9b1ed979c23977d03a368ddd96b7f4984b2ad0b18dae8241; '
		'Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1607495150,1607513322; '
		'__mta=150214537.1607495152037.1607495190317.1607513633276.8; '
		'Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1607513649; _lxsdk_s=1764767217b-813-94f-283%7C%7C2')
	}
	response = requests.get(url=url,headers=headers)
	if response.status_code == 200:
		return response.text
	else:
		return None

def gomessage(html):
	# Extract rank, title, starring cast and release time from each <dd> entry
	items = re.findall('<dd>.*?board-index.*?>(.*?)</i>.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?releasetime">(.*?)</p>', html, re.S)
	for item in items:
		item = tuple(field.strip() for field in item)  # drop surrounding whitespace/newlines
		print(item)
		inputfile(item)

def inputfile(what):
	# Append each record as one JSON line; ensure_ascii=False keeps the Chinese text readable
	with open('猫眼电影排行榜.txt', 'a', encoding='utf-8') as f:
		f.write(json.dumps(what, ensure_ascii=False) + '\n')


def main():
	# The Top 100 board is paged 10 items at a time via the offset parameter
	for i in range(10):
		url = f'https://maoyan.com/board/4?offset={10*i}'
		html = getpage(url)
		if html:
			gomessage(html)

main()
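
Since inputfile writes each record as a JSON array on its own line, the saved rankings can be read back with json.loads. A minimal sketch, assuming the same file name used above:

import json

# Read the records written by inputfile(): one JSON array (rank, title, cast, release time) per line
with open('猫眼电影排行榜.txt', encoding='utf-8') as f:
	for line in f:
		rank, title, star, releasetime = json.loads(line)
		print(rank, title, star, releasetime)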

Zongheng Chinese Net:

import requests
import re
import time,json

def getpage(url):
	headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57',
	# The Cookie must be a single header line; adjacent string literals are concatenated
	'Cookie': ('ZHID=AD632CDD590D53418030A9491C6E21C7; '
		'ver=2018; zhffr=www.baidu.com; sajssdk_2015_cross_new_user=1; '
		'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221764791c0d3cc5-0c2'
		'8a4f47b4783-5a301d45-1327104-1764791c0d4724%22%2C%22%24device_id%22%3A%2217647'
		'91c0d3cc5-0c28a4f47b4783-5a301d45-1327104-1764791c0d4724%22%2C%22pro'
		'ps%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6'
		'%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%'
		'3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baid'
		'u.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E'
		'5%80%BC%22%7D%7D; v_user=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D0dWqAJU6v'
		'kkdGTF8yBhy6WNh_nTqhnxs-62kVf1bo6RQ_fajtUDK2q4aabIUp0oL%26wd%3D%26eqid%3Da20a3'
		'f87000308dc000000065fd0c923%7Chttp%3A%2F%2Fwww.zongheng.com%2F%7C52738807; zh_'
		'visitTime=1607518503190; Hm_lvt_c202865d524849216eea846069349eb9=1607518503; '
		'Hm_up_c202865d524849216eea846069349eb9=%7B%22uid_%22%3A%7B%22value%22%3A%22AD6'
		'32CDD590D53418030A9491C6E21C7%22%2C%22scope%22%3A1%7D%7D; JSESSIONID=abcDTubi'
		'qnrXX-gBDKhzx; zh_rba=true; Hm_lpvt_c202865d524849216eea846069349eb9=1607518541')
	}
	response = requests.get(url=url,headers=headers)
	if response.status_code == 200:
		return response.text
	else:
		return None

def getmessage(html):
	# Capture the book name and its blurb from each ranking entry
	pattern = re.compile('<div class="rank_d_list.*?bookName = (.*?)bookId.*?<div class="rank_d_b_info">(.*?)</div>', re.S)
	items = re.findall(pattern, html)
	for item in items:
		item = tuple(field.strip() for field in item)  # drop surrounding whitespace/newlines
		print(item)
		writefile(item)

def writefile(what):
	# Append each record as one JSON line; ensure_ascii=False keeps the Chinese text readable
	with open('纵横中文网榜单.txt', 'a', encoding='utf-8') as fn:
		fn.write(json.dumps(what, ensure_ascii=False) + '\n')

def main():
	# Crawl pages 1-9 of the ranking, pausing between requests
	for i in range(1, 10):
		url = f"http://www.zongheng.com/rank/details.html?rt=5&d=1&p={i}"
		html = getpage(url)
		if html:
			getmessage(html)
		time.sleep(2)  # be polite: wait 2 seconds between pages

main()
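
Both scripts define an almost identical getpage. A minimal shared version is sketched below; the function name fetch, the 10-second timeout and the exception handling are illustrative additions, not part of the original code:

import requests

def fetch(url, headers):
	# Same pattern as getpage above, with a timeout and network-error handling added
	try:
		response = requests.get(url=url, headers=headers, timeout=10)
	except requests.RequestException:
		return None
	if response.status_code == 200:
		return response.text
	return None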