Python爬取博客的所有文章并存为带目录的word文档
Python爬取博客的所有文章并存为带目录的word文档,结果非常美丽!从此阅读博客文章轻松多了!!!
实现代码:
import requests
from bs4 import BeautifulSoup
# Probe the first article-list page to work out which CSS selectors expose
# the article titles, links and publish times.
url = 'http://blog.sina.com.cn/s/articlelist_5119330124_0_1.html'
wb_data = requests.get(url)
# NOTE: no parser is named, so bs4 uses whichever parser is installed.
soup = BeautifulSoup(wb_data.content)

# Exploration notes (intentionally left commented out):
#   Titles and links of every article on the page:
#       print(soup.select('.atc_title'))
#   Publish times of every article on the page:
#       print(soup.select('.atc_tm'))
#   Anchor inside the first title cell:
#       print(soup.select('.atc_title')[0].select('a'))
#       [<a href="http://blog.sina.com.cn/s/blog_13122c74c0102zdsy.html" target="_blank" title="">投资难在慢成毁于速成</a>]
#   URL of the first article:
#       print(soup.select('.atc_title')[0].select('a')[0].get("href"))
#       http://blog.sina.com.cn/s/blog_13122c74c0102zdsy.html
#   Title text and publish time of the first article:
#       print(soup.select('.atc_title')[0].select('a')[0].text)
#       print(soup.select('.atc_tm')[0].text)
# 获取所有博客文章的链接
import requests
from bs4 import BeautifulSoup
# Collect every article on list pages 1-5.
# all_links maps article title -> [article URL, publish-time text].
# Because the title is the key, two articles with the same title overwrite
# each other and only the last one seen is kept.
all_links = {}
for page in range(1, 6):
    url = f'http://blog.sina.com.cn/s/articlelist_5119330124_0_{page}.html'
    # A timeout keeps one stalled response from hanging the whole crawl.
    wb_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(wb_data.content)
    links = soup.select('.atc_title')
    times = soup.select('.atc_tm')
    # Pair each title cell with its time cell directly instead of indexing
    # two lists in parallel; the loop variables no longer shadow the page
    # counter or the stdlib module name `time`.
    for title_cell, time_cell in zip(links, times):
        http_link = title_cell.select('a')[0].get('href')
        title = title_cell.text.strip()
        all_links[title] = [http_link, time_cell.text]
# Debug aid: print(len(all_links))
# Fetch a single article page and locate the element(s) holding its text.
url = 'http://blog.sina.com.cn/s/blog_13122c74c0102zbt3.html'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.content)
# select() returns a list of every element carrying both the
# `articalContent` and `newfont_family` classes; it is empty when none match.
article = soup.select(".articalContent.newfont_family")
# Exploration notes (intentionally left commented out):
# print(article)
# print(article[0].text)
# Same text with the "\xa0" (non-breaking space) characters removed:
# print(article[0].text.replace("\xa0", ""))
# Fetch a single article page and read the URL of its first image.
url = 'http://blog.sina.com.cn/s/blog_13122c74c0102zbud.html'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.content)
# Take the first matching content element, then its first <img>, and read
# the `real_src` attribute. Either [0] raises IndexError when nothing matches.
# NOTE(review): presumably `real_src` carries the real image URL while `src`
# is a placeholder -- confirm against the page markup.
img_link = (
    soup.select(".articalContent.newfont_family")[0]
    .find_all("img")[0]
    .get("real_src")
)
# Image download helper
def downloadImg(img_url, file_path, timeout=10):
    """Download the resource at *img_url* and save it to *file_path*.

    Args:
        img_url: URL of the image to fetch.
        file_path: Destination path. The file is opened in binary write
            mode, so an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: The server did not respond in time.
        OSError: *file_path* could not be opened for writing.
    """
    response = requests.get(url=img_url, timeout=timeout)
    # Fail loudly on an error status so that an HTML error page is never
    # silently written to disk under an image file name.
    response.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(response.content)
# Try the helper once with a known image URL. The target is a relative path
# written with backslash separators.
downloadImg(
    img_url=r'http://s8.sinaimg.cn/middle/005AsbCIzy7vEfdM1M599',
    file_path=r'..\实例67_Python爬取博客的所有文章并存为带目录的word文档\1.jpg',
)
# 写入标题,内容到word文件
import docx
from docx.oxml.ns import qn # 用于应用中文字体
def to_word(all_links):
    """Fetch every article in *all_links* and write them into one Word file.

    For each entry a level-1 heading (the title) and a paragraph with the
    publish date are added, then the article page is downloaded and its text
    is appended as a further paragraph. The document is saved as
    "新浪微博文章.docx" in the current working directory.

    Args:
        all_links: Mapping of article title to a sequence whose first item
            is the article URL and whose second item is the publish-time
            text; only its first 10 characters (the date) are used.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"}

    document = docx.Document()  # start a new, empty Word document
    # Set the font on the Normal style, including the East Asian font
    # attribute that `qn` is imported for.
    normal_style = document.styles['Normal']
    normal_style.font.name = u'宋体'
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    written = 0  # number of articles whose text was written
    for title, entry in all_links.items():
        document.add_heading(title, 1)
        published_on = entry[1][:10]  # keep the date, drop the time of day
        document.add_paragraph(published_on)

        response = requests.get(entry[0], headers=request_headers)
        page = BeautifulSoup(response.content)
        body_nodes = page.select(".articalContent.newfont_family")
        # Some articles are encrypted and yield no content; those keep only
        # their heading and date and are not counted.
        if not body_nodes:
            continue

        # NOTE(review): the pasted source lost its indentation; the progress
        # message and the counter are assumed to belong to the
        # "content found" branch -- confirm against the original script.
        body_text = body_nodes[0].text.replace("\xa0", "")
        document.add_paragraph(body_text)
        print(f"写入文章 {title} 。")
        written += 1

    print(f"共写入 {written} 篇文章。")
    document.save("新浪微博文章.docx")
# Run the export: write every collected article into the Word document.
to_word(all_links)