Batch-Downloading WeChat Official Account Articles

This program batch-downloads the articles of a WeChat Official Account and saves each one as a PDF. It works in two stages: it first pages through the account's post history via the mp/profile_ext endpoint to collect article links, using the __biz, uin, and key values captured with a proxy tool such as Fiddler; it then fetches each article with wechatsogou and renders it to PDF with pdfkit, which requires the wkhtmltopdf executable.

Code:


import requests
import json
import time


def article_links(index):
    """Fetch the article links on one history page; index selects the page."""
    links = []
    url = 'https://mp.weixin.qq.com/mp/profile_ext'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/83.0.4103.116 Safari/537.36 "}
    proxies = {
        'https': None,
        'http': None,
    }
    uin = 'MjU1Mjg4NjAzNQ=='
    # The key expires after a while; capture a fresh one or no data comes back
    key = '3390dbef8687839b29dd8a2499d96e7ee28002a981da4f1d962e56d4c93430eef1b27ae1dc2e2304590ee88091fcb33e6f59df16f55cf9fa4516fa6ab77f5db7055f877e3b48007fc57fc07a522d0b0d1d15c3ee0ca047f22cd49301079a0e10b01df3fe4069bbdb4dd2cbe38cfa1dcb3ac01a93dd04c8e01504896fcab7d8be'
    # Key request parameters
    params = {
        'action': 'getmsg',
        '__biz': 'MzU4ODg3MzAwNw==',  # differs per account; copy it from Fiddler
        'f': 'json',
        'offset': index * 10,  # controls pagination: each page holds 10 articles
        'count': '10',
        'is_ok': '1',
        'scene': '124',
        'uin': uin,  # differs per account; copy it from Fiddler
        'key': key,  # differs per account; copy it from Fiddler
        'wxtoken': '',
        'x5': '0',
    }
    response = requests.get(url, headers=headers, params=params, proxies=proxies)
    response_dict = json.loads(response.text)
    can_msg_continue = response_dict.get('can_msg_continue')  # 0 means this is the last page
    general_msg_list = response_dict.get('general_msg_list')  # still a JSON string at this point
    data_list = json.loads(general_msg_list)['list']  # parse it and take the list of entries
    for data in data_list:
        # Some entries are plain "messages" rather than articles and lack the
        # fields below, so skip anything that raises a KeyError
        try:
            title = data['app_msg_ext_info']['title']  # article title
            timestamp = data['comm_msg_info']['datetime']  # Unix timestamp
            date = time.strftime('%Y-%m-%d', time.localtime(timestamp))  # timestamp -> local date
            link = data['app_msg_ext_info']['content_url']  # article URL
            info = {
                "url": link,
                "title": title,
                "date": date,
            }
            links.append(info)
        except KeyError:
            continue
    return links, can_msg_continue
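
A quick sanity check of a single page (assuming the captured __biz, uin, and key are still valid) might look like this; the shape of each entry follows from the dict built above:

links, more = article_links(0)
print(links[0])  # e.g. {'url': '...', 'title': '...', 'date': 'YYYY-MM-DD'}
print(more)      # 1 if further pages exist, 0 on the last page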


all_links = []
for i in range(100):  # adjust to the account: 10 articles per page, so 100 pages covers up to 1000 articles
    links, can_msg_continue = article_links(i)  # fetch one page of links
    all_links.extend(links)
    if can_msg_continue == 0:  # 0 means the page just fetched was the last one
        break
print(all_links)

print(len(all_links))
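
Since the captured key expires quickly, it can be worth persisting the collected links before the slow PDF stage, so a failure partway through doesn't force a re-scrape. A minimal sketch; the filename article_links.json is an arbitrary choice:

with open('article_links.json', 'w', encoding='utf-8') as f:
    json.dump(all_links, f, ensure_ascii=False, indent=2)  # ensure_ascii=False keeps Chinese titles readable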


import pdfkit
import wechatsogou


def link_to_pdf(url, title, date):
    ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)  # initialize the API client
    content_info = ws_api.get_article_content(url)  # fetch the article
    content = content_info['content_html']  # article body as HTML
    # Wrap the title and article body in a minimal HTML shell
    html = f'''
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <title>{title}</title>
            </head>
            <body>
            <h2 style="text-align: center;font-weight: 400;">{title}</h2>
            {content}
            </body>
            </html>'''
    path_wkhtmltopdf = r"E:\my\python_project\70个python实战项目\实例70_Python批量将公号文章保留原格式下载为PDF\wkhtmltox\bin\wkhtmltopdf.exe"  # path to the wkhtmltopdf executable
    config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)  # point pdfkit at it
    pdfkit.from_string(html, f"{title} {date}.pdf", configuration=config)  # render to PDF, named title + date
    print(f"{title} {date}.pdf saved")


for link in all_links:
    url = link['url']
    title = link['title']
    date = link['date']
    link_to_pdf(url, title, date)
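
wechatsogou can run into Sogou's captcha when articles are requested in quick succession, and a single bad link would abort the whole run. A hedged variant of the loop above with basic error handling and a pause between requests; the 3-second delay is an arbitrary choice:

for link in all_links:
    try:
        link_to_pdf(link['url'], link['title'], link['date'])
    except Exception as e:  # keep going if a single article fails to fetch or render
        print(f"skipped {link['title']}: {e}")
    time.sleep(3)  # arbitrary pause to reduce the chance of triggering the captcha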