On crawler proxies: scraping free proxy IPs from Kuaidaili (快代理)

The script below uses requests and parsel to scrape Kuaidaili's free proxy list, then checks each proxy and keeps only the ones that actually work.

import requests
import parsel
import time
import random

# A normal desktop User-Agent so the site serves the regular HTML page
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
def get_ips():
    """Scrape pages 1-4 of Kuaidaili's free proxy list."""
    proxies_list = []
    for page in range(1, 5):
        print("Scraping page {}".format(page))
        url = "https://www.kuaidaili.com/free/inha/{}".format(page)
        response = requests.get(url, headers=headers)
        data = response.text

        html_data = parsel.Selector(data)

        # Each row of the free-proxy table holds one proxy record
        tr_parse = html_data.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')

        for tr in tr_parse:
            tmp_dict = {}
            Type = tr.xpath('./td[4]/text()').extract_first()
            IP = tr.xpath('./td[1]/text()').extract_first()
            PORT = tr.xpath('./td[2]/text()').extract_first()
            # requests expects lowercase scheme keys ('http'/'https') and a
            # full proxy URL such as 'http://1.2.3.4:8080'
            scheme = Type.lower()
            tmp_dict[scheme] = "{}://{}:{}".format(scheme, IP, PORT)
            print(tmp_dict)
            proxies_list.append(tmp_dict)
        # Sleep a random 1-5 seconds between pages to avoid getting blocked
        time.sleep(random.randint(1, 5))

    print(proxies_list)
    print("Number of proxies scraped:", len(proxies_list))
    return proxies_list
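
# For reference, get_ips() returns a list of single-entry dicts, one per proxy;
# this is the shape check_ip() below consumes. The addresses here are purely
# illustrative:
#   [{'http': 'http://1.2.3.4:8080'}, {'https': 'https://5.6.7.8:3128'}, ...]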

# Check which of the scraped proxies are actually usable
def check_ip(proxies_list):
    can_use = []
    for pro in proxies_list:
        # Test a URL whose scheme matches the proxy type; otherwise requests
        # ignores the proxy entirely and the check always "passes".
        scheme = list(pro.keys())[0]
        try:
            # A 0.1s timeout would reject almost every proxy; a few seconds is realistic
            response = requests.get("{}://www.baidu.com".format(scheme),
                                    headers=headers, proxies=pro, timeout=3)
            if response.status_code == 200:
                can_use.append(pro)
        except Exception as e:
            print(pro, e)
    return can_use

if __name__ == '__main__':
    proxies_list = get_ips()
    ips = check_ip(proxies_list)
    print("能用的ip",ips)
    print("能用的ip的数量",len(ips))

Notes compiled from a 青灯教育 (Qingdeng Education) open class.
