Batch Downloading Official Account Articles
The program batch-downloads articles from a WeChat Official Account (公众号) and saves each one as a PDF: it first pages through the mp.weixin.qq.com/mp/profile_ext endpoint to collect article links, then renders every article to a PDF file.
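Dependencies: the requests, pdfkit, and wechatsogou packages (installable with pip, e.g. pip install requests pdfkit wechatsogou), plus the wkhtmltopdf executable that pdfkit drives, which is downloaded separately from wkhtmltopdf.org. The __biz, uin, and key request parameters must be captured from the WeChat client with a packet-capture tool such as Fiddler; key in particular expires quickly and has to be refreshed before each run.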
Code:
import requests
import json
import time


def article_links(index):
    """Collect the article links on a single page; index selects the page."""
    links = []
    url = 'https://mp.weixin.qq.com/mp/profile_ext'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/83.0.4103.116 Safari/537.36 "}
    proxies = {
        'https': None,
        'http': None,
    }
    uin = 'MjU1Mjg4NjAzNQ=='
    # key changes over time; capture a fresh one, otherwise no data is returned
    key = '3390dbef8687839b29dd8a2499d96e7ee28002a981da4f1d962e56d4c93430eef1b27ae1dc2e2304590ee88091fcb33e6f59df16f55cf9fa4516fa6ab77f5db7055f877e3b48007fc57fc07a522d0b0d1d15c3ee0ca047f22cd49301079a0e10b01df3fe4069bbdb4dd2cbe38cfa1dcb3ac01a93dd04c8e01504896fcab7d8be'
    # Key request parameters
    params = {
        'action': 'getmsg',
        '__biz': 'MzU4ODg3MzAwNw==',  # differs per account; copy it from Fiddler
        'f': 'json',
        'offset': index * 10,  # paging: each page holds 10 entries
        'count': '10',
        'is_ok': '1',
        'scene': '124',
        'uin': uin,  # differs per account; copy it from Fiddler
        'key': key,  # differs per account; copy it from Fiddler
        'wxtoken': '',
        'x5': '0',
    }
    response = requests.get(url, headers=headers, params=params, proxies=proxies)
    response_dict = json.loads(response.text)
    can_msg_continue = response_dict.get('can_msg_continue')  # 0 means this is the last page
    general_msg_list = response_dict.get('general_msg_list')  # still a JSON string at this point
    data_list = json.loads(general_msg_list)['list']  # parse it and take the per-article entries
    for data in data_list:
        # Some entries are plain "messages" that lack the fields below, so skip them
        try:
            title = data['app_msg_ext_info']['title']  # article title
            timestamp = data['comm_msg_info']['datetime']  # publication timestamp
            date = time.strftime('%Y-%m-%d', time.localtime(timestamp))  # timestamp -> local date
            content_url = data['app_msg_ext_info']['content_url']  # article link
            links.append({
                "url": content_url,
                "title": title,
                "date": date,
            })
        except KeyError:
            continue
    return links, can_msg_continue
all_links = []
for i in range(100):  # adjust to the account: each page holds 10 articles, so 100 pages covers up to 1000
    links, can_msg_continue = article_links(i)
    all_links.extend(links)
    if can_msg_continue == 0:  # reached the last page
        break
print(all_links)
print(len(all_links))
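Collecting the links can take a while, and the key may expire midway, so it helps to persist what was collected before the PDF step; a failed run can then resume without re-crawling. A minimal sketch (the filename links.json is an arbitrary choice, not part of the original script):

# Optional: save the collected links so the PDF step can be re-run later.
# json is already imported above; 'links.json' is an arbitrary filename.
with open('links.json', 'w', encoding='utf-8') as f:
    json.dump(all_links, f, ensure_ascii=False, indent=2)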
import pdfkit
import wechatsogou

ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)  # create the API client once, not per article


def link_to_pdf(url, title, date):
    """Render one article to a PDF named '<title> <date>.pdf'."""
    content_info = ws_api.get_article_content(url)  # fetch the article page
    content = content_info['content_html']  # the article body as HTML
    # Wrap the title and body in a minimal HTML document
    html = f'''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>{title}</title>
    </head>
    <body>
    <h2 style="text-align: center;font-weight: 400;">{title}</h2>
    {content}
    </body>
    </html>'''
    path_wkhtmltopdf = r"E:\my\python_project\70个python实战项目\实例70_Python批量将公号文章保留原格式下载为PDF\wkhtmltox\bin\wkhtmltopdf.exe"  # path to the wkhtmltopdf executable
    config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)  # point pdfkit at the executable
    pdfkit.from_string(html, f"{title} {date}.pdf", configuration=config)  # write the PDF under the chosen name
    print(f"{title}.pdf downloaded")
for link in all_links:
    url = link['url']
    title = link['title']
    date = link['date']
    link_to_pdf(url, title, date)
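One caveat: article titles can contain characters that Windows forbids in filenames (\ / : * ? " < > |), which makes pdfkit fail when it writes the output file. A hypothetical helper such as the one below, applied to title before building the filename, avoids that; replacing with an underscore is an arbitrary choice:

import re

def safe_filename(name):
    # Strip characters that are invalid in Windows filenames; '_' is arbitrary
    return re.sub(r'[\\/:*?"<>|]', '_', name)

# usage: pdfkit.from_string(html, f"{safe_filename(title)} {date}.pdf", configuration=config)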