Raccon: Download a WeChat Official Account's Article History

Python crawler

Tutorial

1. Log in to the WeChat Official Accounts Platform admin backend.

2. In the editor for a new article (图文素材), add a hyperlink.

3. Choose to link to content from another official account, then use Chrome DevTools to capture the target account's ID (fakeid) together with your own cookie and token. (A small helper for pulling these values out of the captured request URL is sketched after these steps.)

4. Replace the four parameters in main.py (fakeid, token, cookie, subscriptionName) and run it:

   - token
   - cookie
   - fakeid: the target official account's ID
   - subscriptionName: the target official account's name

5. The generated csv/subscriptionName.csv file contains each article's title and link.

6. In DownloadArticles.py, change the path of the CSV file to read from:

   filename = 'D:\\WxArticles\\灼识新维度.csv'

7. Run DownloadArticles.py to download the articles; the output path can be changed as needed.
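For step 3, the request to watch for in the DevTools Network tab is the one sent to https://mp.weixin.qq.com/cgi-bin/appmsg when the target account's article list loads in the hyperlink dialog (the same endpoint SearchArticles.py calls): its query string carries the token and fakeid, and your cookie is in the request headers. Below is a minimal sketch for pulling token and fakeid out of a copied request URL; extract_params is a hypothetical helper, not part of the repository, and the example URL is a placeholder assembled from the parameters the scripts themselves use.

from urllib.parse import urlparse, parse_qs

def extract_params(captured_url):
    # parse_qs returns each query parameter as a list of URL-decoded values
    qs = parse_qs(urlparse(captured_url).query)
    return qs.get("token", [""])[0], qs.get("fakeid", [""])[0]

# Placeholder URL in the shape of the appmsg request used by SearchArticles.py
token, fakeid = extract_params(
    "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5"
    "&fakeid=MzU0Mjc2OTkzNQ%3D%3D&token=449184339&lang=zh_CN&f=json&ajax=1"
)
print(token, fakeid)  # 449184339 MzU0Mjc2OTkzNQ==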

Code

Replace fakeid, token, cookie, and subscriptionName
main.py

from SearchArticles import SearchArticles

sa = SearchArticles()

cookie = "noticeLoginFlag=1; remember_acct=939571103@qq.com; appmsglist_action_3899606125=card; pgv_pvi=3658089472; RK=dAhgLO1zFn; ptcz=320bcb8913d9209a298c4d9755601f2ca537961f0e483181755d2561930f0551; pgv_pvid=1893749385; pac_uid=0_f6bdab6174356; ua_id=iGm30scGY5BGw01dAAAAAOIgUEv1oMHVMTX51uXBU8U=; wxuin=17542032848578; mm_lang=zh_CN; bizuin=3899606125; noticeLoginFlag=1; remember_acct=939571103@qq.com; rand_info=CAESID/VFuExB9FpR4vLJPb0I0hVYKc9wbB0yCyXZ5RfrOfw; slave_bizuin=3899606125; data_bizuin=3899606125; data_ticket=H3VfTdKW2kFiosn0VSsk4ovvG9eRXroxkD0/4DB/nIShnVfUMx+H7HWUwZW74FSe; slave_sid=ZWNUZ3NDUF9xMGF6ZDFkVmZTNnVncGFGcDBncWZTWld0dG1HaHNfUjFKa29lVzlrNDF6U3Q4a0RGZHJpQXo5R1lGX1E1Y1dsYjRTd1MyMWZLaU9yMEZlaFhqYnNRWkRRVmU0VDdQYk53XzFsUmVaZ2czaVNWY0FWeEJWZU9KcVJ4YUc3SnM4cFJGZHJ3WUZ3; slave_user=gh_fd75692e1901; xid=9876540b284ba5a0c7efc4efc2d131de; openid2ticket_obURv6BZFggp6EkxAA9EgvbjtKrc=fC16vfAqLKee3Aung6TSXBl3AqRwiY3BlClMUuA55ao="
token = "449184339"
fakeid = "MzU0Mjc2OTkzNQ=="  # the official account's fakeid
subscriptionName = "灼识新维度"  # the official account's name

sa.Search(cookie, token, fakeid, subscriptionName)
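One practical note: Search() appends its results to ./csv/<subscriptionName>.csv but does not create the csv folder itself, so it is worth making sure the folder exists before calling sa.Search(...). A one-line sketch, not part of the original main.py:

import os
os.makedirs("./csv", exist_ok=True)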

Query the official account's article history
SearchArticles.py

import requests
import time
import pandas as pd


# Query an official account's article history through the material manager of the
# WeChat Official Accounts Platform; replace the cookie and token with your own.
class SearchArticles():

    def Search(self, cookie, token, fakeid, gzhname):
        # Target url
        url = "https://mp.weixin.qq.com/cgi-bin/appmsg"

        # Reuse the cookie to skip the login step
        headers = {
# "Cookie": "noticeLoginFlag=1; remember_acct=939571103@qq.com; appmsglist_action_3899606125=card; pgv_pvi=3658089472; RK=dAhgLO1zFn; ptcz=320bcb8913d9209a298c4d9755601f2ca537961f0e483181755d2561930f0551; pgv_pvid=1893749385; pac_uid=0_f6bdab6174356; ua_id=iGm30scGY5BGw01dAAAAAOIgUEv1oMHVMTX51uXBU8U=; wxuin=17542032848578; mm_lang=zh_CN; uuid=dc9de218a70caba323c66f6c73f64049; bizuin=3899606125; ticket=d4057fa40008ba0a4a6b06fdb1305ee8581b9d44; ticket_id=gh_fd75692e1901; cert=5IR7o9Jfyae8Vv4aZNEyP_ZMoA_M4bKe; rand_info=CAESIOIEo8jxeYjZgn5o2tJmJaZrCfX6tEo9ekwX3ns1X6Sa; slave_bizuin=3899606125; data_bizuin=3899606125; data_ticket=B3y1+wleyMv5XZ3btC/bTo7dl742x/EW4yBaI+Z0TlFjBNQdiVs7Fqh84kp063Ar; slave_sid=RVFNUE9fZFhNM25DSDZ3a1RveGxuRW9DUEtQMnhjYkFBS2hfTlpmVzd3VUJEMUhmMngxQjBJYjBOSGlDbktQaW9IRHdQeGFZVkRDODZmeU85SWRVdXFNR0dVRXl1ejdBVm51c2lmU3pIa1dMTzM1MFVzNDBVOHhtQ2R6T2I2MkF6eEJ1M0trVGdGVllVaUVY; slave_user=gh_fd75692e1901; xid=b93d1ea0d99d857e243c4e3c8e68d164; openid2ticket_obURv6BZFggp6EkxAA9EgvbjtKrc=pMBbA/39HQzR9igG41qgJbWSCMTZR+WvHqCxuXZEeQA=",
"Cookie": cookie,
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
}

# fakeid是公众号id
data = {
# "token": "921232635",
"token": token,
"lang": "zh_CN",
"f": "json",
"ajax": "1",
"action": "list_ex",
"begin": "0",
"count": "5",
"query": "",
# "fakeid": "MzU0Mjc2OTkzNQ==",
"fakeid": fakeid,
"type": "9",
}

content_list = []
for i in range(50):
data["begin"] = i * 5
time.sleep(3)
# 使用get方法进行提交
content_json = requests.get(url, headers=headers, params=data).json()
# 返回了一个json,里面是每一页的数据
for item in content_json["app_msg_list"]:
# 提取每页文章的标题及对应的url
items = []
items.append(item["title"])
# items.append(item["digest"]) # 文章摘要
items.append(item["link"])
content_list.append(items)
print(i)

name = ['title', 'link']
# name = ['title', 'digest', 'link']
test = pd.DataFrame(columns=name, data=content_list)
saveCsvPath = "./csv/" + f'{gzhname}' + ".csv"
test.to_csv(saveCsvPath, mode='a', encoding='utf-8')
print("保存成功")
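Search always requests 50 pages of 5 articles, so it fetches at most 250 articles and keeps polling even after the history is exhausted; and if a response comes back without app_msg_list (for example when the request is throttled), the inner loop raises a KeyError. The sketch below is a hedged alternative, not part of the repository: fetch_all_articles is a hypothetical standalone helper that stops paging as soon as a page comes back empty, assuming an empty or missing app_msg_list means there are no more articles.

import time
import requests

def fetch_all_articles(cookie, token, fakeid, max_pages=50):
    # Same endpoint and parameters as SearchArticles.Search, but stop as soon
    # as a page comes back with no articles.
    url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
    headers = {
        "Cookie": cookie,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
    }
    data = {
        "token": token, "lang": "zh_CN", "f": "json", "ajax": "1",
        "action": "list_ex", "begin": "0", "count": "5", "query": "",
        "fakeid": fakeid, "type": "9",
    }
    articles = []
    for i in range(max_pages):
        data["begin"] = i * 5
        time.sleep(3)  # keep the original pacing to reduce the risk of throttling
        page = requests.get(url, headers=headers, params=data).json().get("app_msg_list", [])
        if not page:
            break
        articles.extend([item["title"], item["link"]] for item in page)
    return articles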

Download the official account's articles
DownloadArticles.py

import os
import requests
from bs4 import BeautifulSoup
import pandas as pd


# Download official account articles from their urls

# Read a file
def ReadFile(filePath):
    with open(filePath, 'r', encoding='utf-8') as f:
        all_the_text = f.read()
    return all_the_text


# Save a file
def SaveFile(filePath, fileContent):
    with open(filePath, 'w', encoding='utf-8') as f:
        f.write(fileContent)


# Download an image to a local path
def DownloadImg(url, savePath):
    r = requests.get(url)
    with open(savePath, 'wb') as f:
        f.write(r.content)


# Rewrite the src of each image in the page so the images display locally
def ChangeImgSrc(htmlSource, SubscriptionName, dirnum=0):
    bs = BeautifulSoup(htmlSource, "lxml")  # build a BeautifulSoup object from the page source
    imgList = bs.find_all("img")  # find every img tag in the page
    imgIndex = 0  # image counter, so each image is saved under a distinct name
    dirs1 = 'D:\\Raccon\\%s\\images\\%s' % (str(SubscriptionName), str(dirnum))
    print(dirs1)
    if not os.path.exists(dirs1):
        os.makedirs(dirs1)
    for img in imgList:
        imgIndex += 1
        originalUrl = ""  # holds the image's real url
        if "data-src" in img.attrs:  # guard against img tags that have no data-src
            originalUrl = img.attrs["data-src"]
        elif "src" in img.attrs:  # fall back to src when it is present
            originalUrl = img.attrs["src"]
        else:
            originalUrl = ""
        if originalUrl.startswith("//"):  # protocol-relative urls need an http prefix
            originalUrl = "http:" + originalUrl

        if len(originalUrl) > 0:  # download the image if a url was found
            print(originalUrl)
            if "data-type" in img.attrs:
                imgType = img.attrs["data-type"]  # file extension
            else:
                imgType = "png"  # default to png when no extension is given
            imgName = str(imgIndex) + "." + imgType
            imgSavePath = "D:/Raccon/" + f'{SubscriptionName}/' + "images/" + f'{dirnum}/' + imgName  # image save path
            DownloadImg(originalUrl, imgSavePath)  # download the image
            img.attrs["src"] = "images/" + f'{dirnum}/' + imgName  # relative path used in the saved page
        else:
            img.attrs["src"] = ""

    return str(bs)  # convert the BeautifulSoup object back to a string for saving


# Download the web page at url and return its source
def DownloadHtml(url):
    # Build the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
    }

    # Send the request as a browser would
    response = requests.get(url, headers=headers)
    if response.status_code == 200:  # 200 means a normal response
        htmltext = response.text  # page source
        # print(htmltext)
        return htmltext
    else:
        return None


# Extract the subscription name from the csv file path
def splitName(filename):
    if filename.find('\\') > 0:
        SubscriptionName = filename.split('\\')[-1].split('.')[0]
        return SubscriptionName
    else:
        return 'test'


if __name__ == '__main__':

    filename = 'D:\\WxArticles\\灼识新维度.csv'

    SubscriptionName = splitName(filename)

    # Create the output folder if it does not exist yet
    dirs2 = 'D:\\Raccon\\%s' % str(SubscriptionName)
    if not os.path.exists(dirs2):
        os.makedirs(dirs2)

    num = 0
    with open(filename, 'r', encoding='utf-8') as f:
        num = len(f.readlines()) - 1
    print(num)

    # Read the csv of articles
    df = pd.read_csv(filename)

    # Download each article
    for i in range(num):
        print("开始下载第" + str(i) + "篇文章")
        print(df["title"][i])
        # print(df["link"][i])

        title = df["title"][i].replace('/', '').replace('|', '')
        url = df["link"][i]
        htmlStr = DownloadHtml(url)
        htmlStr2 = ChangeImgSrc(htmlStr, SubscriptionName, i)
        savePath = "D:/Raccon/" + f'{SubscriptionName}/' + title + ".html"
        SaveFile(savePath, htmlStr2)
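The main loop strips only '/' and '|' from titles before using them in the save path, but Windows file names also reject characters such as \ : * ? " < >, so SaveFile can still fail on some titles. A small sketch that strips the full set; safe_filename is a hypothetical helper, not part of the repository:

import re

def safe_filename(title):
    # Remove every character Windows does not allow in file names,
    # plus trailing spaces and dots, which Windows also rejects.
    return re.sub(r'[\\/:*?"<>|]', '', title).rstrip(' .')

# e.g. in the loop above: title = safe_filename(df["title"][i])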

GitHub repository

Source code:
https://github.com/OllieGo/Raccon