Python 将网页（HTML）文件转换为 PDF 和 PNG

时间：2022-05-23　幸福的达哥　人气：1

一、HTML 网页文件转 PDF

# Export HTML (a URL, a file, a list of either, or raw markup) to a PDF file.
def html_to_pdf(html_path, pdf_path='.\\pdf_new.pdf', html_encoding='UTF-8', path_wkpdf=r'.\Tools\wkhtmltopdf.exe'):
    r'''
    Export HTML content to a PDF file through pdfkit / wkhtmltopdf.

    The kind of input is detected from ``html_path`` itself:

    * a list made up only of ``http://`` / ``https://`` strings, or a single
      such string, is handed to ``pdfkit.from_url``;
    * any other non-empty list is handed to ``pdfkit.from_file`` as a list
      of file paths;
    * a string ending in ``.html`` / ``.htm`` (any letter case), or naming an
      existing file, is handed to ``pdfkit.from_file``;
    * anything else is treated as raw HTML markup and handed to
      ``pdfkit.from_string``.

    :param html_path: str or list of str; the source to convert (URL(s),
        HTML file path(s), or a raw HTML string)
    :param pdf_path: str; path of the PDF file to write
    :param html_encoding: str; passed to the tool as the ``encoding`` option.
        It has to match the encoding the HTML page was actually saved with
    :param path_wkpdf: str; path of the wkhtmltopdf executable
    :return: None
    :raises ValueError: if ``html_path`` is an empty list
    '''
    import os  # local import: only needed for the existing-file check below

    def _is_url(item):
        # Check the scheme prefix instead of searching for "http" anywhere,
        # so markup that merely contains a link is not mistaken for a URL.
        return isinstance(item, str) and item.lower().startswith(('http://', 'https://'))

    def _is_html_file(item):
        if not isinstance(item, str):
            return False
        if item.lower().endswith(('.html', '.htm')):
            return True
        try:
            return os.path.isfile(item)
        except (OSError, ValueError):
            # Raw markup can contain characters that are illegal in a path.
            return False

    cfg = pdfkit.configuration(wkhtmltopdf=path_wkpdf)
    options = {
        "encoding": html_encoding,
    }

    # Lists are examined first: the previous substring test on str(list)
    # made the list branch unreachable.
    if isinstance(html_path, list):
        if not html_path:
            raise ValueError('html_path must not be an empty list')
        if all(_is_url(item) for item in html_path):
            print('http=>pdf')
            pdfkit.from_url(html_path, pdf_path, options=options, configuration=cfg)
        else:
            # e.g. [r'./helloworld.html', r'./111.html', r'./222.html']
            print('html,list=>pdf')
            pdfkit.from_file(html_path, pdf_path, options=options, configuration=cfg)
    elif _is_url(html_path):
        # Let the tool fetch the page itself, then render it.
        print('http=>pdf')
        pdfkit.from_url(html_path, pdf_path, options=options, configuration=cfg)
    elif _is_html_file(html_path):
        print('html,str=>pdf')
        pdfkit.from_file(html_path, pdf_path, options=options, configuration=cfg)
    else:
        # Raw HTML markup (e.g. '<html>...</html>') ends up here.
        print('from_string=>pdf')
        pdfkit.from_string(html_path, pdf_path, options=options, configuration=cfg)

所需的外部程序：

wkhtmltopdf.exe

下载地址

二、HTML 网页文件转 PNG

# Export HTML (a URL, a file, a list of either, or raw markup) to an image.
def html_to_png(html_path, pdf_path='.\\pdf_new.pdf', html_encoding='UTF-8', path_wkpdf=r'.\Tools\wkhtmltoimage.exe'):
    r'''
    Export HTML content to an image file through imgkit / wkhtmltoimage.

    The kind of input is detected from ``html_path`` itself:

    * a list made up only of ``http://`` / ``https://`` strings, or a single
      such string, is handed to ``imgkit.from_url``;
    * any other non-empty list is handed to ``imgkit.from_file`` as a list
      of file paths;
    * a string ending in ``.html`` / ``.htm`` (any letter case), or naming an
      existing file, is handed to ``imgkit.from_file``;
    * anything else is treated as raw HTML markup and handed to
      ``imgkit.from_string``.

    :param html_path: str or list of str; the source to convert (URL(s),
        HTML file path(s), or a raw HTML string)
    :param pdf_path: str; path of the image file to write. The name and the
        default value are kept for backward compatibility.
        NOTE(review): the default still ends in ``.pdf``; callers presumably
        want to pass an image path such as ``out.png`` explicitly - confirm.
    :param html_encoding: str; passed to the tool as the ``encoding`` option.
        It has to match the encoding the HTML page was actually saved with
    :param path_wkpdf: str; path of the wkhtmltoimage executable
    :return: None
    :raises ValueError: if ``html_path`` is an empty list
    '''
    import os  # local import: only needed for the existing-file check below

    def _is_url(item):
        # Check the scheme prefix instead of searching for "http" anywhere,
        # so markup that merely contains a link is not mistaken for a URL.
        return isinstance(item, str) and item.lower().startswith(('http://', 'https://'))

    def _is_html_file(item):
        if not isinstance(item, str):
            return False
        if item.lower().endswith(('.html', '.htm')):
            return True
        try:
            return os.path.isfile(item)
        except (OSError, ValueError):
            # Raw markup can contain characters that are illegal in a path.
            return False

    cfg = imgkit.config(wkhtmltoimage=path_wkpdf)
    options = {
        "encoding": html_encoding,
    }

    # Lists are examined first: the previous substring test on str(list)
    # made the list branch unreachable.
    if isinstance(html_path, list):
        if not html_path:
            raise ValueError('html_path must not be an empty list')
        if all(_is_url(item) for item in html_path):
            print('http=>png')
            imgkit.from_url(html_path, pdf_path, options=options, config=cfg)
        else:
            # e.g. [r'./helloworld.html', r'./111.html', r'./222.html']
            print('html,list=>png')
            imgkit.from_file(html_path, pdf_path, options=options, config=cfg)
    elif _is_url(html_path):
        # Let the tool fetch the page itself, then render it.
        print('http=>png')
        imgkit.from_url(html_path, pdf_path, options=options, config=cfg)
    elif _is_html_file(html_path):
        print('html,str=>png')
        imgkit.from_file(html_path, pdf_path, options=options, config=cfg)
    else:
        # Raw HTML markup (e.g. '<html>...</html>') ends up here.
        print('from_string=>png')
        imgkit.from_string(html_path, pdf_path, options=options, config=cfg)

所需的外部程序：

wkhtmltoimage.exe

下载地址

加载全部内容