python爬虫必备知识点, 代理请求数据

爬取免费代理,并检测代理是否可用,网址为:

http://www.ip3366.net/?stype=1&page=1

 

#1.确认爬取的地址
#2.发送地址请求
#3.数据解析
#4.数据保存


import requests
import parsel
import time
def check_ip(proxiles_list, test_url='https://www.baidu.com', timeout=2):
    """Return the proxies from ``proxiles_list`` that can fetch ``test_url``.

    Each entry is passed unchanged as the ``proxies`` argument of
    ``requests.get``. A proxy is reported as usable only when the request
    completes without raising and the response status code is 200.

    Args:
        proxiles_list: Iterable of proxy mappings in the form accepted by
            ``requests.get(proxies=...)``.
        test_url: URL requested through each proxy.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        list: The usable proxy mappings, in input order.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }

    can_use = []
    for proxies in proxiles_list:
        try:
            response = requests.get(url=test_url, headers=headers,
                                    proxies=proxies, timeout=timeout)
        except Exception:
            # Best-effort probe: any failure marks the proxy as unusable.
            # ``Exception`` (not a bare ``except``) lets Ctrl-C / SystemExit
            # still abort a long check run.
            print('当前代理:',proxies,'请求超时,代理不可用')
            continue

        # Only a 200 response counts; previously the "usable" message was
        # printed for every response that did not raise, even non-200 ones.
        if response.status_code == 200:
            can_use.append(proxies)
            print('当前代理:',proxies,'****代理可用****')
        else:
            print('当前代理:',proxies,'状态码:',response.status_code,'代理不可用')

    return can_use

proxy_list = []

# The request headers are the same for every page, so build them once.
headers =  {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"}

for page in range(1,11):
    # Pause before each page request (presumably to go easy on the site).
    time.sleep(2)
    # 1. Build the URL of the listing page to crawl.
    url = f"http://www.ip3366.net/?stype=1&page={page}"

    # 2. Send the request. A timeout keeps a stalled connection from hanging
    #    the crawl, and a failed page is skipped so the proxies collected so
    #    far are not lost.
    try:
        response = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('页面请求失败,已跳过:', url, exc)
        continue
    response.encoding = response.apparent_encoding   # let requests guess the body encoding
    html_data = response.text

    # 3. Parse the HTML and select the rows of the proxy table via XPath.
    selector = parsel.Selector(html_data)
    trs = selector.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')

    for tr in trs:
        # ``.get()`` returns None when a cell has no text node; normalise to
        # '' so such rows can be skipped instead of raising TypeError below.
        ip = (tr.xpath('./td[1]/text()').get() or '').strip()      # IP address
        port = (tr.xpath('./td[2]/text()').get() or '').strip()    # port
        if not ip or not port:
            continue

        ip_proxy = ip + ':' + port
        # NOTE(review): the "https" entry uses an ``https://`` proxy URL. With
        # requests/urllib3 that scheme describes the connection to the proxy
        # itself; if the listed proxies are plain-HTTP proxies this probably
        # needs to be ``http://`` — confirm against the site and library docs.
        proxies_dict = {
            "http": "http://" + ip_proxy,
            "https": "https://" + ip_proxy,
        }
        print('保存完成',proxies_dict)
        proxy_list.append(proxies_dict)

    # Progress output after each page: everything collected so far.
    print(proxy_list)
    print('获取到的代理数量:',len(proxy_list))

# 4. Check every collected proxy and report the ones that work.
print('***************************正在检测代理***************************')
can_use = check_ip(proxy_list)
print('可用的代理有:',can_use)
print('可用的代理数量:',len(can_use))

 

上一篇:高质量代理ip+ssl证书解决


下一篇:2021-01-12