原创:Python爬虫实战之爬取代理ip
EliotShen（人气:2）—— 编程的快乐只有在运行成功的那一刻才知道 QAQ
目标网站:https://www.kuaidaili.com/free/inha/ #若有侵权请联系我
因为上面的代理都是http的所以没写这个判断
代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape free proxy ip:port pairs from kuaidaili.com and append them to ip.txt."""
import urllib.request
import re
import time

n = 1
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}


def web(url):
    """Fetch one listing page, pair up the found IPs with their ports, and persist them.

    url: full page URL of one proxy-listing page.
    """
    # BUG FIX: the original built `req` with the browser headers but then
    # passed the bare `url` to urlopen, so the User-Agent was never sent.
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read().decode('UTF-8', 'ignore')
    ip = r'[0-9]+(?:\.[0-9]+){3}'
    port = r'"PORT">(\d{0,1}\d{0,1}\d{0,1}\d{0,1}\d)<'
    out = re.findall(ip, html)
    out1 = re.findall(port, html)
    dictionary = {}
    # Pair at most 15 entries (the page lists 15 rows), but bound the loop by
    # what was actually found so a short/empty page cannot raise IndexError.
    for i in range(min(15, len(out), len(out1))):
        dictionary[out[i]] = out1[i]
    # BUG FIX: store once after the dict is complete; the original called
    # store() inside the loop, re-writing every earlier entry on each pass.
    store(dictionary)
    print(out, '\n', out1)


def store(dictionary):
    """Append every ip -> port pair in `dictionary` to ip.txt (one line each)."""
    # Open the file once for the whole batch instead of once per key.
    with open('ip.txt', 'a') as f:
        for key in dictionary:
            f.write('ip:' + key + '\tport:' + dictionary[key] + '\n')
    print('store successfully')


# Guard the crawl loop so importing this module does not start scraping.
if __name__ == "__main__":
    while n <= 3313:
        url1 = "https://www.kuaidaili.com/free/intr/"
        url = url1 + str(n) + '/'
        web(url)
        time.sleep(5)  # throttle: one page every 5 s to avoid being banned
        n += 1
加载全部内容