微信文章批量下载|单网页链接全转存word

昨天接到老师布置的保存微信文章的任务,但是内容实在太多,所以我直接写了一个爬虫把文章保存至 Word 文档。由于流程比较简单,这里直接提供完整代码。

[python]
# -*- coding: utf-8 -*-
"""Scrape every article linked from a WeChat index post and collect them into one Word file."""
import re

import chardet  # NOTE(review): not used in this script; kept for compatibility, safe to remove
import requests
from bs4 import BeautifulSoup  # HTML parser (used with the 'lxml' backend below)
from docx import Document

# Output Word document; the collection title is added before any articles.
doc = Document()
doc.add_heading('十九大以来人民日报重要评论理论文章合集', level=1)

# Matches the inner text of the styled <span> that marks article links on the
# index page (original comment said it extracts the read count — presumably it
# really matches the link/title span; verify against the page markup).
p1 = r'(?<=<span style="max-width: 100%;box-sizing: border-box !important;word-wrap: break-word !important;">).+?(?=</span>)'
pattern1 = re.compile(p1)

# Browser-like request headers; many sites reject requests without a User-Agent.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
}

# The WeChat index post whose outbound links are scraped.
urls = "https://mp.weixin.qq.com/s/iM0y4ywGss5ASvBeqLG1zw"
try:
    # Fetch the index page and collect every anchor tag on it.
    htmls = requests.get(urls, headers=headers)
    Soups = BeautifulSoup(htmls.text, 'lxml')  # 'lxml' is the parser backend
    a_list = Soups.find_all('a')
    for a in a_list:
        # Only anchors whose markup contains the styled title <span> are article links.
        key = str(a)
        matcher1 = re.search(pattern1, key)
        if not matcher1:
            continue
        url = a.get('href')  # .get() avoids a KeyError on anchors without href
        if not url:
            continue
        html = requests.get(url, headers=headers)
        # Skip anything that did not load cleanly (the original only skipped 404,
        # letting other error statuses fall through to the parser).
        if html.status_code != 200:
            continue
        # The target pages are GBK-encoded but requests defaults to ISO-8859-1;
        # re-decode, falling back to requests' own guess if the page differs.
        try:
            text = html.text.encode("iso-8859-1").decode('gbk')
        except (UnicodeEncodeError, UnicodeDecodeError):
            text = html.text
        Soup = BeautifulSoup(text, 'lxml')
        # Defaults so an unexpected page layout cannot raise NameError below
        # (the original left these unbound when elements were missing).
        title = ''
        subtitle = ''
        fl = ''
        paras = []
        # Check which of the two known page layouts this article uses.
        if not Soup.find(id='rwb_zw'):
            # Alternate layout: title in <h1>, source/date line in <p class="sou">,
            # body paragraphs inside <div class="show_text">.
            if Soup.find('h1'):
                title = Soup.find('h1').get_text().replace('\n', '').replace('\t', '')
                print(title)
            if Soup.find('p', class_='sou'):
                fl = Soup.find('p', class_='sou').get_text().replace('\n', '').replace('\t', '')
                print(fl)
            body_div = Soup.find("div", class_="show_text")
            if body_div:  # guard: original crashed with AttributeError when absent
                paras = body_div.find_all('p')
        else:
            # Standard layout: extract title, subtitle, and byline, then the
            # body paragraphs inside the element with id 'rwb_zw'.
            if Soup.find('h1'):
                title = Soup.find('h1').get_text().replace('\n', '').replace('\t', '')
                print(title)
            if Soup.find('h4'):
                subtitle = Soup.find('h4').get_text().replace('\n', '').replace('\t', '')
                print(subtitle)
            if Soup.find('div', class_='box01'):
                fl = Soup.find('div', class_='box01').get_text().replace('\n', '').replace('\t', '')
                print(fl)
            paras = Soup.find(id='rwb_zw').find_all('p')
        if not paras:
            continue  # nothing usable extracted from this page
        # Write the article into the Word document: headings then body text.
        doc.add_heading(title, level=1)
        doc.add_heading(subtitle, level=2)
        doc.add_heading(fl, level=3)
        for para in paras:
            p = para.get_text().replace('\n', '').replace('\t', '').replace(' ', '')
            print(p)
            doc.add_paragraph(p)
finally:
    # Save whatever was collected even if an exception interrupts the scrape.
    doc.save(u'C:/Users/leo/Desktop/十九大以来人民日报重要评论理论文章合集.docx')
[/python]

点赞

发表评论