70行代码爬取 查字典_笑话(多线程)
"""Multithreaded scraper for the joke section of www.chazidian.com.

One producer thread walks the paginated listing and pushes article URLs
onto a bounded queue; several worker threads pull URLs off the queue,
download each article and print its title together with either its text
or its image URL.
"""
# Byline carried over from the page this script was published on:
# 小毅i (popularity: 11)

from queue import Queue, Empty
import sys
import threading
import time
from urllib.parse import urljoin

import requests
from pyquery import PyQuery

index = 'https://www.chazidian.com'
list_page = index + '/xiaohua{}/{}'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
}
# Seconds a worker waits on an empty queue before re-checking for shutdown.
timeout = 1
# Number of worker threads downloading articles.
thread_quantity = 5
# Seconds before an HTTP request is abandoned, so one stalled connection
# cannot hang a thread forever.
request_timeout = 10
# Number of listing pages to walk (pages are numbered from 1).
page_count = 75

# Serialises the multi-line print() groups so output from concurrently
# running workers is not interleaved.
_print_lock = threading.Lock()


def get_url(queue, done=None):
    """Push the URL of every article found on the listing pages onto *queue*.

    Args:
        queue: queue that receives absolute article URLs (strings).
        done: optional ``threading.Event`` that is set once the last listing
            page has been handled, including when this function exits through
            an unexpected exception, so consumers waiting on it can stop.
    """
    try:
        for page in range(1, page_count + 1):
            page_url = list_page.format('', str(page))
            try:
                r = requests.get(page_url, headers=headers,
                                 timeout=request_timeout)
            except requests.RequestException as exc:
                # One unreachable listing page should not abort the crawl.
                print('request failed:', page_url, exc, file=sys.stderr)
                continue
            pq = PyQuery(r.text)
            for link in pq('div.arctcot h3 a').items():
                href = link.attr('href')
                if not href:
                    # Anchor without a target: nothing to fetch.
                    continue
                # urljoin copes with absolute hrefs and with relative hrefs
                # that lack a leading slash; for the usual "/path" form it
                # yields the same result as plain concatenation with `index`.
                queue.put(urljoin(index, href))
    finally:
        if done is not None:
            done.set()


def get_content(queue, done=None):
    """Consume article URLs from *queue* and print what each article contains.

    For every article a ``{title: image_url}`` or ``{title: text}`` dict is
    printed, preceded by the article URL.

    Args:
        queue: queue of article URLs to download.
        done: optional ``threading.Event`` set by the producer when no more
            URLs will arrive. When given, the worker keeps waiting on an
            empty queue until the event is set. When omitted, the worker
            stops the first time the queue stays empty for ``timeout``
            seconds.
    """
    while True:
        # Sample the flag *before* waiting: if the producer had already
        # finished when the wait began, an empty queue is final.
        finished = done is None or done.is_set()
        try:
            url = queue.get(timeout=timeout)
        except Empty:
            if finished:
                break
            continue

        if 'https://' not in url:
            continue
        try:
            r = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as exc:
            # Skip the failing article instead of killing the worker thread.
            print('request failed:', url, exc, file=sys.stderr)
            continue

        doc = PyQuery(r.text)('div.arctcot')
        title = doc('a').text()
        content = doc('div.article_detail').text()
        img = doc('div.article_detail img').attr('src')

        record = None
        if title and img:
            # NOTE(review): articles whose image src contains 'http://' are
            # dropped without any output. This mirrors the original rule;
            # confirm whether it is an intentional filter.
            if 'http://' not in img:
                record = {title: index + img}
        elif title and content:
            if content not in title:
                record = {title: content}

        if record is not None:
            with _print_lock:
                print(url)
                print(record)

    with _print_lock:
        print('-' * 100)
        print('抓取完毕')


def main():
    """Run the producer and ``thread_quantity`` workers until the crawl ends."""
    queue_ = Queue(maxsize=1000)
    listing_done = threading.Event()

    producer = threading.Thread(target=get_url, args=(queue_, listing_done))
    producer.start()

    workers = [
        threading.Thread(target=get_content, args=(queue_, listing_done))
        for _ in range(thread_quantity)
    ]
    # Start every worker before joining any of them so they really run
    # concurrently, then wait for all of them rather than only the last.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    producer.join()


if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    # Workers idle for up to `timeout` seconds on the final empty-queue wait;
    # that idle time is subtracted from the reported duration.
    print('用时: ', end - start - timeout)
加载全部内容