Scraping All of a Blog's Articles with Python and Saving Them as a Word Document with a Table of Contents

Use Python to scrape every article from a blog and save them into a single Word document with a table of contents. The result looks great, and reading the blog's articles becomes much easier!
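The script relies on three third-party packages: requests for fetching pages, beautifulsoup4 for parsing the HTML, and python-docx for writing the Word file. All three install with pip (pip install requests beautifulsoup4 python-docx).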

Implementation code:

import requests
from bs4 import BeautifulSoup

url = 'http://blog.sina.com.cn/s/articlelist_5119330124_0_1.html'  # first page of the article list
wb_data = requests.get(url)

soup = BeautifulSoup(wb_data.content, 'html.parser')

# Get the titles and links of all articles on the current page
# print(soup.select('.atc_title'))

# Get the publication time of every article on the current page
# print(soup.select('.atc_tm'))

# print(soup.select('.atc_title')[0].select('a'))
# [<a href="http://blog.sina.com.cn/s/blog_13122c74c0102zdsy.html" target="_blank" title="">投资难在慢成毁于速成</a>]
# print(soup.select('.atc_title')[0].select('a')[0].get("href"))
# http://blog.sina.com.cn/s/blog_13122c74c0102zdsy.html
# print(soup.select('.atc_title')[0].select('a')[0].text)

# print(soup.select('.atc_tm')[0].text)

# Collect the links of all blog articles
all_links = {}
for page in range(1, 6):  # the article list spans 5 pages for this blog
    url = f'http://blog.sina.com.cn/s/articlelist_5119330124_0_{page}.html'
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.content, 'html.parser')
    links = soup.select('.atc_title')
    times = soup.select('.atc_tm')
    for j in range(len(links)):  # use a separate index name so the page loop variable is not shadowed
        http_link = links[j].select('a')[0].get('href')
        title = links[j].text.strip()
        time = times[j].text
        all_links[title] = [http_link, time]

# print(len(all_links))

# Get the text of a single article
url = 'http://blog.sina.com.cn/s/blog_13122c74c0102zbt3.html'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.content, 'html.parser')
article = soup.select(".articalContent.newfont_family")  # the class name used on Sina's blog pages
# print(article)
# print(article[0].text)
# print(article[0].text.replace("\xa0", ""))  # strip non-breaking spaces

# Get an image link from a single article
url = 'http://blog.sina.com.cn/s/blog_13122c74c0102zbud.html'
wb_data = requests.get(url)
soup = BeautifulSoup(wb_data.content, 'html.parser')
img_link = soup.select(".articalContent.newfont_family")[0].find_all("img")[0].get("real_src")  # Sina keeps the real image URL in 'real_src'


# Image download function
def downloadImg(img_url, file_path):
    req = requests.get(url=img_url)
    with open(file_path, 'wb') as f:  # write the raw image bytes to disk
        f.write(req.content)


downloadImg(r'http://s8.sinaimg.cn/middle/005AsbCIzy7vEfdM1M599',
            r'..\实例67_Python爬取博客的所有文章并存为带目录的word文档\1.jpg')
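The image is only saved to disk here and never placed into the Word file. If the pictures should appear in the document too, python-docx can insert them with add_picture. A minimal sketch, assuming the 1.jpg downloaded above is reachable from the working directory:

import docx
from docx.shared import Inches

doc = docx.Document()
doc.add_picture('1.jpg', width=Inches(4))  # fix the width; the height scales to keep the aspect ratio
doc.save('picture_test.docx')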

# Write the titles and content to a Word file
import docx
from docx.oxml.ns import qn  # used to apply the Chinese font


def to_word(all_links):
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"}
    doc = docx.Document()  # create a new Word document
    doc.styles['Normal'].font.name = u'宋体'  # SimSun
    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    counter = 0  # counts the articles actually written to the document
    for title in all_links.keys():
        doc.add_heading(title, 1)  # level-1 headings are what Word's table of contents is built from
        date = all_links[title][1][:10]  # keep only the date, drop the time
        doc.add_paragraph(date)
        wb_data = requests.get(all_links[title][0], headers=header)
        soup = BeautifulSoup(wb_data.content, 'html.parser')
        article = soup.select(".articalContent.newfont_family")
        # Some articles are protected and their content cannot be fetched; article is then
        # an empty list, hence the if check
        if article:
            text = article[0].text.replace("\xa0", "")  # strip non-breaking spaces
            doc.add_paragraph(text)
            print(f"Wrote article: {title}")
            counter += 1
    print(f"Wrote {counter} articles in total.")
    doc.save("sina_blog_articles.docx")


to_word(all_links)
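One more note on the table of contents promised in the title: add_heading only creates Heading 1 paragraphs, and the TOC itself is normally inserted afterwards in Word (References -> Table of Contents, or Ctrl+A then F9 to update fields). python-docx has no high-level TOC API, but a TOC field can be embedded through its low-level XML layer. A rough sketch of that approach, my own addition rather than part of the original script; Word will still prompt to update the field when the file is opened:

from docx.oxml import OxmlElement
from docx.oxml.ns import qn


def add_toc(doc):
    # Build a TOC field covering heading levels 1-3, with hyperlinked entries
    paragraph = doc.add_paragraph()
    run = paragraph.add_run()
    fld_begin = OxmlElement('w:fldChar')
    fld_begin.set(qn('w:fldCharType'), 'begin')
    instr = OxmlElement('w:instrText')
    instr.set(qn('xml:space'), 'preserve')
    instr.text = 'TOC \\o "1-3" \\h \\z \\u'  # the same field code Word itself inserts
    fld_sep = OxmlElement('w:fldChar')
    fld_sep.set(qn('w:fldCharType'), 'separate')
    fld_end = OxmlElement('w:fldChar')
    fld_end.set(qn('w:fldCharType'), 'end')
    run._r.append(fld_begin)
    run._r.append(instr)
    run._r.append(fld_sep)
    run._r.append(fld_end)

Calling add_toc(doc) right after docx.Document() in to_word would place the table of contents before the first article.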