1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
| import os import requests from bs4 import BeautifulSoup import pandas as pd
def ReadFile(filePath):
    """Return the entire content of the file at *filePath* as a string.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(filePath, mode='r', encoding='utf-8') as source:
        return source.read()
def SaveFile(filePath, fileContent):
    """Write the string *fileContent* to *filePath* as UTF-8 text.

    The file is opened in 'w' mode, so an existing file is overwritten.
    """
    sink = open(filePath, mode='w', encoding='utf-8')
    with sink:
        sink.write(fileContent)
def DownloadImg(url, savePath, timeout=30):
    """Fetch *url* and store the response body in the file *savePath*.

    Args:
        url: Address of the resource to download.
        savePath: Destination file path; opened in binary write mode.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on a stalled connection.

    Returns:
        True when the body was written, False when the server answered with
        an HTTP error status (nothing is written in that case).

    Raises:
        requests.RequestException: on connection failures or timeouts.
        OSError: if *savePath* cannot be opened for writing.
    """
    r = requests.get(url, timeout=timeout)
    if not r.ok:
        # An error page is not image data; do not save it under an image name.
        print("image download failed (HTTP %s): %s" % (r.status_code, url))
        return False
    with open(savePath, 'wb') as f:
        f.write(r.content)
    return True
def ChangeImgSrc(htmlSource, SubscriptionName, dirnum=0, baseDir='D:\\Raccon'):
    """Download every <img> in *htmlSource* and point it at the local copy.

    Images are saved as ``<baseDir>/<SubscriptionName>/images/<dirnum>/<n>.<type>``
    where ``n`` is the 1-based position of the <img> tag in the document and
    ``type`` is the tag's ``data-type`` attribute (``png`` when absent). Each
    downloaded tag's ``src`` is rewritten to the relative path
    ``images/<dirnum>/<n>.<type>``.

    Args:
        htmlSource: HTML markup to process.
        SubscriptionName: Name of the per-subscription output folder.
        dirnum: Sub-folder (usually the article index) under ``images``.
        baseDir: Root output directory.

    Returns:
        The modified document serialised back to a string.
    """
    bs = BeautifulSoup(htmlSource, "lxml")

    # Build the directory once so that the folder that is created and the
    # folder files are written into are guaranteed to be the same path.
    imgDir = os.path.join(baseDir, str(SubscriptionName), 'images', str(dirnum))
    print(imgDir)
    os.makedirs(imgDir, exist_ok=True)

    for imgIndex, img in enumerate(bs.find_all("img"), start=1):
        # "data-src" takes precedence over "src" when both are present.
        if "data-src" in img.attrs:
            originalUrl = img.attrs["data-src"]
        elif "src" in img.attrs:
            originalUrl = img.attrs["src"]
        else:
            originalUrl = ""

        # Protocol-relative URL: supply a scheme so it can be fetched.
        if originalUrl.startswith("//"):
            originalUrl = "http:" + originalUrl

        if not originalUrl:
            img.attrs["src"] = ""
            continue

        print(originalUrl)

        if not originalUrl.lower().startswith(("http://", "https://")):
            # data: URIs, relative paths, etc. cannot be fetched by requests;
            # keep the reference as it is instead of aborting the whole run.
            img.attrs["src"] = originalUrl
            continue

        imgType = img.attrs.get("data-type", "png")
        imgName = str(imgIndex) + "." + imgType
        try:
            DownloadImg(originalUrl, os.path.join(imgDir, imgName))
        except requests.RequestException as err:
            # One unreachable image should not lose the rest of the article;
            # leave the tag pointing at the remote address.
            print("image download failed: %s (%s)" % (originalUrl, err))
            img.attrs["src"] = originalUrl
            continue

        img.attrs["src"] = "images/" + f'{dirnum}/' + imgName

    return str(bs)
def DownloadHtml(url, timeout=30):
    """Fetch *url* with browser-like headers and return the page text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on a stalled connection.

    Returns:
        The decoded response body when the status code is 200, otherwise None.

    Raises:
        requests.RequestException: on connection failures or timeouts.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text
def splitName(filename):
    """Derive a subscription name from a file path.

    The name is the last path component up to its first dot, e.g.
    ``D:\\WxArticles\\abc.csv`` -> ``abc``. Both ``\\`` and ``/`` are accepted
    as separators, and a separator at the very start of the path counts too.

    Args:
        filename: Path to the source file.

    Returns:
        The derived name, or the fallback ``'test'`` when *filename* contains
        no path separator at all.
    """
    # Treat both separator styles alike; the rest of the file mixes them.
    normalized = filename.replace('/', '\\')
    if '\\' not in normalized:
        return 'test'
    lastComponent = normalized.rsplit('\\', 1)[-1]
    return lastComponent.split('.')[0]
if __name__ == '__main__':
    # CSV listing the articles to fetch; the "title" and "link" columns are read.
    filename = 'D:\\WxArticles\\灼识新维度.csv'
    SubscriptionName = splitName(filename)

    # Per-subscription output folder for the saved HTML pages.
    dirs2 = os.path.join('D:\\Raccon', str(SubscriptionName))
    os.makedirs(dirs2, exist_ok=True)

    df = pd.read_csv(filename)
    # Count rows from the parsed frame rather than from physical lines: a
    # quoted field containing a newline or trailing blank lines would make
    # the two disagree and lead to out-of-range row lookups.
    num = len(df)
    print(num)

    # Characters that are not allowed in Windows file names.
    forbiddenChars = str.maketrans('', '', '\\/:*?"<>|')

    for i, (rawTitle, url) in enumerate(zip(df["title"], df["link"])):
        print("开始下载第" + str(i) + "篇文章")
        print(rawTitle)

        htmlStr = DownloadHtml(url)
        if htmlStr is None:
            # DownloadHtml returns None for non-200 responses; there is
            # nothing to parse or save for this article.
            print("article download failed, skipped: " + str(url))
            continue

        title = str(rawTitle).translate(forbiddenChars)
        htmlStr2 = ChangeImgSrc(htmlStr, SubscriptionName, i)
        savePath = os.path.join(dirs2, title + ".html")
        SaveFile(savePath, htmlStr2)
|