python实现统计博客园博文数量、阅读量、评论数

时间:2020-03-11 -零人气:0

如何使用

1.将代码中的headurl替换成你的博客链接即可。

类似：
https://www.cnblogs.com/-wenli/default.html?page=

2.根据自己博文的实际页数，调整爬取页数的上限（下面的条件表示爬取第1页到第54页）。

while i<55

源码

from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
import re

def get_one_page(url, headers, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers sent with the request (e.g. a User-Agent).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The response text when the server answers with status 200, or
        ``None`` for any other status code or when the request raises
        ``RequestException`` (which includes timeouts).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract per-post read and comment counts from one blog list page.

    Each ``div.day`` element is scanned: the titles found in its
    ``div.postTitle`` children are paired by position with its
    ``div.postDesc`` children, and the counts are stored in the module-level
    ``item`` dict as ``item[title] = {"阅读量": reads, "评论量": comments}``.
    The counts are kept as the digit strings captured by the regex.

    Args:
        html: Page source, or ``None``/empty when the download failed.

    Returns:
        None. Results are written to the global ``item``.
    """
    global item
    if not html:
        # Nothing was downloaded for this page, so there is nothing to parse.
        return
    soup = BeautifulSoup(html, 'lxml')

    for day in soup.find_all('div', class_='day'):
        post_titles = []
        for title in day.find_all('div', class_='postTitle'):
            links = title.find_all('a', class_='postTitle2')
            if links:
                # Strip line breaks and spaces from the title text.
                text = links[0].get_text().replace('\n', '').replace(' ', '')
                post_titles.append(text)
            else:
                # Keep a placeholder so titles stay aligned with descriptions.
                post_titles.append(None)

        descriptions = day.find_all('div', class_='postDesc')
        for post_title, description in zip(post_titles, descriptions):
            # Get the text, then drop line breaks and spaces before matching.
            info = description.get_text().replace('\n', '').replace(' ', '')
            result = re.match(r'^.*阅读.(\d+)..*评论.(\d+)..*编辑$', info)
            if post_title is None or result is None:
                # Skip entries whose title or counters could not be read.
                continue
            # A fresh dict per post: sharing one dict across a day's posts
            # would make them all report the last post's counts.
            item[post_title] = {
                "阅读量": result.group(1),
                "评论量": result.group(2),
            }
def statistics():
    """Print the number of posts and the summed read and comment counts.

    Reads the module-level ``item`` mapping, whose values are dicts holding
    the counts under the keys ``'阅读量'`` and ``'评论量'``; each count is
    converted with ``int`` before being added up.
    """
    global item
    records = list(item.values())
    total_reads = sum(int(record['阅读量']) for record in records)
    total_comments = sum(int(record['评论量']) for record in records)
    print('总博文量：', len(records))
    print('总阅读量：', total_reads)
    print('总评论量：', total_comments)


def main():
    """Crawl the blog's list pages 1 to 54 and print the aggregated totals.

    For every page number the URL ``headurl + str(page)`` is printed and
    downloaded with ``get_one_page``; pages that were fetched successfully
    are handed to ``parse_one_page``. After the loop ``statistics`` prints
    the totals.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    # Put your cnblogs list-page URL prefix here; the page number is appended
    # to it (presumably something ending in "default.html?page=").
    headurl = ''
    # Adjust the upper bound to the number of list pages your blog has.
    for page in range(1, 55):
        url = headurl + str(page)
        print(url)
        # Fetch the page source.
        html = get_one_page(url, headers)
        if html is None:
            # Download failed or returned a non-200 status: nothing to parse.
            continue
        # Parse the page source.
        parse_one_page(html)
    # Print the statistics.
    statistics()


# Shared accumulator written by parse_one_page() and read by statistics():
# maps a post title to {'阅读量': ..., '评论量': ...}. It is defined at module
# level so those functions also work when this file is imported, not only
# when it is run as a script.
item = {}

if __name__ == '__main__':
    main()

加载全部内容