
requests | Scraping Beautiful Wallpapers


Scraping target: the first 50 pages under the Toplist tag on Wallhaven, 1,196 wallpapers in total.
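Before launching the full crawl, an optional sanity check: fetch page 1 with the same headers and XPath used in the script below and confirm the thumbnail count looks reasonable (the per-page count depends on your Wallhaven settings; the default of roughly 24 thumbnails per page is an assumption on my part, not something taken from the script).

import requests
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0'}
resp = requests.get('https://wallhaven.cc/toplist?page=1', headers=headers, timeout=10)
lis = etree.HTML(resp.text).xpath('//*[@id="thumbs"]/section[1]/ul/li')  # same selector as the full script
print(f'page 1 contains {len(lis)} thumbnails')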

Code:

# -*- coding:utf-8 -*-
import os
import requests
from lxml import etree


# 1. Fetch the HTML of each listing page
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/95.0.4638.54 Safari/537.36 '
}


# Fetch one Toplist listing page and parse it with lxml
def get_html_info(page):
    url = f'https://wallhaven.cc/toplist?page={page}'
    resp = requests.get(url, headers=headers)
    resp_html = etree.HTML(resp.text)
    return resp_html


# 2. Parse the detail-page URLs on one listing page and download the full-size images
def get_pic(resp_html):
    pic_url_list = []
    lis = resp_html.xpath('//*[@id="thumbs"]/section[1]/ul/li')  # all thumbnail <li> nodes on this page
    for li in lis:
        pic_url = li.xpath('./figure/a/@href')[0]  # detail-page URL stored in each thumbnail
        pic_url_list.append(pic_url)
    if not os.path.exists('Wallhaven'):
        os.mkdir('Wallhaven')
    for pic_url in pic_url_list:
        resp2 = requests.get(pic_url, headers=headers)
        r_html2 = etree.HTML(resp2.text)
        pic_size = r_html2.xpath('//*[@id="showcase-sidebar"]/div/div[1]/h3/text()')[0]  # resolution, used in the filename
        final_url = r_html2.xpath('//*[@id="wallpaper"]/@src')[0]  # direct URL of the full-size image
        pic = requests.get(url=final_url, headers=headers).content
        with open(os.path.join('Wallhaven', pic_size + final_url[-10:]), mode='wb') as f:
            f.write(pic)  # save the image
        print(pic_size + final_url[-10:] + ' downloaded, {} wallpapers saved so far'.format(len(os.listdir('Wallhaven'))))


# 3. main() drives the crawl
def main():
    page_range = range(1, 51)  # crawl pages 1-50
    for i in page_range:
        r = get_html_info(i)
        get_pic(r)
        print(f'=============== Page {i} finished ===============')


if __name__ == '__main__':
    main()
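The full run fires well over a thousand HTTP requests back to back (50 listing pages, plus one detail page and one image download per wallpaper). Below is a minimal hardening sketch, assuming you care about resumability and politeness rather than raw speed; the download() helper and its parameter names are my own, not part of the original script.

import os
import time
import requests

def download(url, path, headers, delay=1.0, timeout=10):
    """Save url to path unless the file already exists; pause briefly afterwards."""
    if os.path.exists(path):
        return False  # already downloaded, so an interrupted run can resume
    data = requests.get(url, headers=headers, timeout=timeout).content
    with open(path, mode='wb') as f:
        f.write(data)
    time.sleep(delay)  # small delay between downloads to stay polite
    return True

Swapping this helper in for the inline requests.get/open pair inside get_pic() leaves the rest of the script unchanged.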