Crawling WeChat Official Account Articles with Python

chuansong.me indexes articles published by many WeChat official accounts, so it is a convenient place to look up the articles you want to crawl.
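The listing pages of an account follow a simple pagination scheme: each page shows 12 articles and is addressed by a ?start= offset, which is exactly how the script below pages through an account. Here is a minimal sketch of the URLs it requests; the account ID 'example' is only a placeholder:

# Sketch of the listing URLs the crawler below walks through.
# 'example' is a placeholder account ID; each listing page holds 12 articles.
name = 'example'
pages = 3
urls = ['http://chuansong.me/account/%s?start=%d' % (name, 12 * i) for i in range(pages)]
print(urls)  # .../account/example?start=0, ?start=12, ?start=24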

# -*- coding: utf-8 -*-

import requests
from requests.exceptions import RequestException
import re
import time
import json
import random
import os


count = 0  # number of links already downloaded


def get_one_page(url):
    # A User-Agent header is required, otherwise the site blocks the request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise HTTPError if the status code is not 200
        response.encoding = response.apparent_encoding
        return response.text
    except RequestException:
        return "request failed"


def mkdir(offset):
    global count
    path = os.path.join(os.getcwd(), str(offset))
    path_csv = os.path.join(path, str(offset) + '.csv')
    if not os.path.exists(path):
        os.makedirs(path)
        with open(path_csv, 'w', encoding='utf-8') as f:
            f.write('链接,标题,日期' + '\n')  # CSV header: link, title, date; the commas must be ASCII commas
    else:
        count += 1
        print("Links downloaded so far:", count)
    return path


def write_to_file(content, offset):
    path = os.path.join(mkdir(offset), str(offset) + '.csv')
    with open(path, 'a', encoding='utf-8') as f:  # append mode; content is one CSV line
        f.write(json.dumps(content, ensure_ascii=False).strip('\'\"') + '\n')


def parse_one_page(html):
    # Extract (article link, title, date) tuples from one listing page
    pattern = re.compile('<div class="feed_item_question">.*?<span>.*?<a class="question_link" href="(.*?)".*?_blank">(.*?)</a>.*?"timestamp".*?">(.*?)</span>', re.S)
    items = re.findall(pattern, html)
    return items


def judge_info(name):
    url = 'http://chuansong.me/account/' + str(name) + '?start=' + str(0)
    wait = round(random.uniform(1, 2), 2)  # random delay between requests to avoid being banned
    time.sleep(wait)
    html = get_one_page(url)

    pattern1 = re.compile('<h1>Page Not Found.</h1>', re.S)
    item1 = re.findall(pattern1, html)  # list

    # N = total number of listing pages, taken from the pagination footer ("下一页" = next page)
    pattern2 = re.compile('<a href="/account/.*?">(.\d+)</a>(\s*)</span>(\s*?)<a href="/account/.*" style="float: right">下一页</a>')
    item2 = re.findall(pattern2, html)  # list

    if item1:
        print("\n--------- This account has not been indexed --------\n")
        exit()
    else:
        print("\n--------- Number of article pages N indexed for this account:", item2[0][0])


def main(offset, i):
    url = 'http://chuansong.me/account/' + str(offset) + '?start=' + str(12 * i)
    print(url)
    wait = round(random.uniform(1, 2), 2)  # random delay between requests to avoid being banned
    time.sleep(wait)
    html = get_one_page(url)
    for item in parse_one_page(html):
        info = 'http://chuansong.me' + item[0] + ',' + item[1] + ',' + item[2]
        info = info.replace('\n', '')
        write_to_file(info, offset)


if __name__ == "__main__":
    print("\nNote: if the program exits right away, the input is probably wrong\n"
          "\nAuthor: Ctipsy\n")
    name = input("Enter the official account ID: ")
    judge_info(name)
    pages = input("\nEnter the number of article pages to fetch (<N): ")
    for i in range(int(pages)):
        main(name, i)
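After a run, each account gets its own folder containing a CSV with one link, title, and date per line. Here is a minimal sketch for reading that file back with the standard csv module; 'example' again stands for whatever account ID was entered at the prompt:

import csv

# Read back the per-account CSV written by write_to_file above.
# 'example' is a placeholder for the account ID entered at the prompt.
with open('example/example.csv', encoding='utf-8') as f:
    reader = csv.reader(f)
    header = next(reader)  # 链接,标题,日期 -> link, title, date
    for row in reader:
        if len(row) == 3:
            link, title, date = row
            print(date, title, link)

Titles that themselves contain commas split into more than three fields, which is why the length check skips those rows.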

Reference: https://blog.csdn.net/gonglun7465/article/details/81945271
