Collecting the Historical Articles of a WeChat Official Account
Preface: This collects the historical articles of an official account, and the collected links stay valid permanently; personally tested for more than two years without problems.
1. First register a personal official account at https://mp.weixin.qq.com/ for later use.
2. In the left sidebar, click 圖文素材 (rich media), then 新的創(chuàng)作 (new creation) and 寫新圖文 (write a new article).
3. Click the 超鏈接 (hyperlink) button and enter the official account you want to query.
4. A small tip: use XPath to grab the HTML source and extract the article body.
from lxml import etree

def get_html_code(parseHtml, url, codeXpath):
    # url is unused here but kept to match the caller's signature
    code_html = parseHtml.xpath(codeXpath)
    html = ''
    for i in code_html:
        # etree.tostring() outputs the normalized HTML as bytes;
        # encode as utf-8, then decode back into a str
        html += etree.tostring(i, encoding='utf-8').decode()
    return html
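For context, a minimal usage sketch, assuming the requests and lxml libraries; the article URL is a placeholder for a link collected in the later steps:

import requests
from lxml import etree

# placeholder article link; substitute a real one collected below
resp = requests.get('https://mp.weixin.qq.com/s?__biz=...',
                    headers={'User-Agent': 'Mozilla/5.0'})
parseHtml = etree.HTML(resp.text)
# '//*[@id="js_content"]' is the body container used later in this article
body = get_html_code(parseHtml, resp.url, '//*[@id="js_content"]')
print(body[:200])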
5. Capture the requests with F12 (browser DevTools) and you will find that the token and cookie are bound to the logged-in user; generate the code from these.
Account search endpoint: query=xxxx
Article-link and pagination endpoint: fakeid= the id of the searched account, base64-encoded
If the target accounts are not fixed, you can keep the account ids in a mapping table to cut down on requests, as sketched below.
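A minimal standalone sketch of such a mapping table, assuming the requests library; the cache filename, token, and cookie values are placeholders, and the query parameters mirror the params dict used by the spider below. The searchbiz endpoint is only hit on a cache miss:

import json
import os
import requests

FAKEID_CACHE = 'fakeid_cache.json'  # hypothetical local cache file

def get_fakeid(name, token, cookie):
    cache = {}
    if os.path.exists(FAKEID_CACHE):
        with open(FAKEID_CACHE, encoding='utf-8') as f:
            cache = json.load(f)
    if name in cache:
        return cache[name]  # mapping-table hit: no request needed
    resp = requests.get(
        'https://mp.weixin.qq.com/cgi-bin/searchbiz',
        params={'action': 'search_biz', 'begin': 0, 'count': 5, 'query': name,
                'token': token, 'lang': 'zh_CN', 'f': 'json', 'ajax': 1},
        headers={'Cookie': cookie, 'User-Agent': 'Mozilla/5.0'})
    fakeid = resp.json()['list'][0]['fakeid']
    cache[name] = fakeid
    with open(FAKEID_CACHE, 'w', encoding='utf-8') as f:
        json.dump(cache, f, ensure_ascii=False)
    return fakeid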
# encoding:utf-8
import json
import math
import time

from spiders.BaseSpider import BaseSpider
from mysqldb.mysql_util import select_link
from parse.requests_parse import requestsParse
from WX.WeChat_util import parse_bjnews, parseWechat


class WeChatSpider(BaseSpider):
    """
    Defines the URL parameters and request headers.
    """
    token = "your token"

    def __init__(self):
        super().__init__()
        self.cookies = "your cookie"
        self.params = {
            'action': 'search_biz',
            'begin': '0',
            'count': '50',
            'query': '',
            'fakeid': None,
            'type': 9,
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1'
        }

    def next_type(self, token, source, apartment, province, city, district, label):
        begin, count = 0, 1
        self.params["action"] = 'list_ex'
        self.params["query"] = ''
        self.params["count"] = '5'  # page size must match the begin step of 5 used below
        content = self.get_information('https://mp.weixin.qq.com/cgi-bin/appmsg')
        # the list endpoint serves 5 articles per page, so page count = total / 5
        page = int(math.ceil(json.loads(content).get("app_msg_cnt") / 5))
        # print(page)
        for index in range(page):
            if index % 3 == 0:
                # pause every 3 pages to avoid triggering rate limits
                time.sleep(60)
            try:
                if index:
                    begin += 5
                self.params["begin"] = begin
                print(f"Fetching articles at offset {begin}")
                content = self.get_information('https://mp.weixin.qq.com/cgi-bin/appmsg')
                nextLink = parse_bjnews(content)
                for nextUrl in nextLink:
                    if not token:
                        # select_link checks the database for the link;
                        # incremental mode stops at already-collected links
                        if not select_link(link=nextUrl):
                            break
                    self.nextUrl_q.put(nextUrl)
                if self.nextUrl_q.empty():
                    return
                # routine collection; requestsParse() is the parsing method
                t = requestsParse(self.nextUrl_q, source, apartment, province, city, district,
                                  titleXpath='//h2[@id="activity-name"]/text()',
                                  codeXpath='//*[@id="js_content"]', labeel=label)
                if t:
                    return
                count += 1
            except Exception as e:
                print("Spider WeChat Main Error= " + str(e) +
                      " Spider WeChat success page= " + str(count) + ' ' + source)
                return

    def main(self, source, apartment, province, city, district, label):
        token = 0  # token 0 = incremental, 1 = full collection; works together with select_link()
        self.params["query"] = source
        html = self.get_information('https://mp.weixin.qq.com/cgi-bin/searchbiz')
        print(html)
        self.params["fakeid"] = parseWechat(html)
        self.next_type(token, source, apartment, province, city, district, label)


if __name__ == '__main__':
    WeChatSpider().main("account name", "wechat", "province", "city", "district",
                        label="industrial_economy_policy")
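BaseSpider and its get_information method are imported above but never shown. As a rough sketch of what get_information presumably does, assuming a plain GET that sends the prepared params with the user-bound cookie (the real class may differ):

import requests
from queue import Queue

class BaseSpider:
    def __init__(self):
        self.cookies = ''
        self.params = {}
        self.nextUrl_q = Queue()  # queue of collected article links

    def get_information(self, url):
        # send the prepared params with the user-bound cookie; return the JSON text
        resp = requests.get(url, params=self.params,
                            headers={'Cookie': self.cookies,
                                     'User-Agent': 'Mozilla/5.0'})
        return resp.text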
6. Here are the two helper methods from the utility module.
def parse_bjnews(content):
    # pull the article links out of the list endpoint's JSON response
    content = json.loads(content)
    nextLink = []
    for link in content.get("app_msg_list"):
        # if titleParse(link.get("title")) and title_Wechat(link.get("title")):
        nextLink.append(link.get("link"))
    return nextLink


def parseWechat(html):
    # extract the fakeid of the first match from the search endpoint's response
    try:
        html = json.loads(html)
        fakeid = html.get("list")[0].get("fakeid")
        return fakeid
    except Exception:
        return None
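A quick illustration of what the two helpers consume and return; the response fragments below are made up for the example and only contain the fields the code actually reads (app_msg_list/link and list/fakeid):

import json

list_response = json.dumps({
    "app_msg_cnt": 2,
    "app_msg_list": [
        {"title": "t1", "link": "https://mp.weixin.qq.com/s?__biz=a"},
        {"title": "t2", "link": "https://mp.weixin.qq.com/s?__biz=b"},
    ]
})
search_response = json.dumps({"list": [{"fakeid": "MzA0illustrative=="}]})

print(parse_bjnews(list_response))   # ['https://mp.weixin.qq.com/s?__biz=a', ...]
print(parseWechat(search_response))  # 'MzA0illustrative=='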