2023年12月3號更新版 修復問題
參加新聞比賽,需要獲取大眾對某一方面的態度信息,因此選擇微博作為信息收集的一部分
完整代碼
微博主體內容
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import json
# Cookies sent with every request in this script; replace them with your own.
# NOTE(review): these look like real session values -- presumably they should
# not be committed or shared; confirm before publishing.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7k7YlpnahLGVhB90-mk0xFNznyCVsjyu9-7-Hk0jRULM.',
    'SUB': '_2A25IaC_CDeRhGeFO61AY8i_NwzyIHXVrBC0KrDV8PUNbmtAGLVLckW9NQYCXlpjzhYwtC8sDM7giaMcMNIlWSlP6',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KzhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'ALF': '1733137172',
    '_s_tentry': 'weibo.com',
    'Apache': '435019984104.0236.1701606621998',
    'ULV': '1701606622040:13:2:2:435019984104.0236.1701606621998:1701601199048',
}
def get_the_list_response(q='話題', n='1', p='頁碼', timeout=10):
    """Fetch one page of Weibo search results.

    Args:
        q: Search query (the topic), sent as the ``q`` parameter. The default
            is only a placeholder.
        n: Value of the ``nodup`` parameter.
        p: Page number, sent as the ``page`` parameter. The default is only a
            placeholder.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the crawl indefinitely.

    Returns:
        The ``requests.Response`` of the GET to ``https://s.weibo.com/weibo``.
    """
    # Headers mimicking an Edge/Chromium browser navigation request.
    headers = {
        'authority': 's.weibo.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'referer': 'https://s.weibo.com/weibo?q=%23%E6%96%B0%E9%97%BB%E5%AD%A6%E6%95%99%E6%8E%88%E6%80%92%E6%80%BC%E5%BC%A0%E9%9B%AA%E5%B3%B0%23&nodup=1',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
    }
    params = {
        'q': q,
        'nodup': n,
        'page': p,
    }
    response = requests.get(
        'https://s.weibo.com/weibo',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=timeout,
    )
    return response
def parse_the_list(text):
    """Parse one search-result page into a table of posts.

    Args:
        text: HTML source of a search-result page.

    Returns:
        A ``pandas.DataFrame`` with the columns ``mid``, ``content``, ``star``
        and ``time``. A field that cannot be found in a post is ``None``.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(text, 'html.parser')
    rows = []
    for div in soup.select('div[action-type="feed_list_item"]'):
        mid = div.get('mid')

        time_links = div.select('div.card-feed > div.content > div.from > a:first-of-type')
        # get_text() also works when the link wraps several nodes, where
        # .string would be None and .strip() would raise AttributeError.
        posted_at = time_links[0].get_text().strip() if time_links else None

        paragraphs = div.select('div.card-feed > div.content > p:last-of-type')
        if paragraphs:
            # Drop zero-width spaces and surrounding whitespace per text node.
            content = '\n'.join(
                piece.replace('\u200b', '').strip() for piece in paragraphs[0].strings
            ).strip()
        else:
            content = None

        like_spans = div.select('ul > li > a > button > span.woo-like-count')
        # First text node of the like counter; None if the span is empty
        # (the previous list(...)[0] raised IndexError in that case).
        star = next(iter(like_spans[0].strings), None) if like_spans else None

        rows.append((mid, content, star, posted_at))
    df = pd.DataFrame(rows, columns=['mid', 'content', 'star', 'time'])
    return df
def get_the_list(q, p):
    """Crawl search-result pages 1..p for the topic ``q``.

    Args:
        q: Search query (the topic).
        p: Number of pages to request, starting from page 1.

    Returns:
        A list with one DataFrame per page that answered with HTTP 200.
    """
    df_list = []
    for i in range(1, p + 1):
        response = get_the_list_response(q=q, p=i)
        if response.status_code != 200:
            # Report the failure instead of dropping the page silently.
            print(f'第{i}頁請求失敗!status_code:{response.status_code}', flush=True)
            continue
        df_list.append(parse_the_list(response.text))
        print(f'第{i}頁解析成功!', flush=True)
    return df_list
if __name__ == '__main__':
    # Set the cookies above to your own before running.
    # The topic literal had been corrupted by a stray pinyin annotation
    # "(fā)"; restored to the simplified-Chinese hashtag used on Weibo.
    q = '#华为发布会#'
    p = 20
    df_list = get_the_list(q, p)
    if df_list:
        df = pd.concat(df_list)
        # utf_8_sig avoids garbled Chinese when the CSV is opened directly.
        df.to_csv(f'{q}.csv', index=False, encoding='utf_8_sig')
    else:
        # pd.concat raises ValueError on an empty list, so skip the export.
        print('沒有取得任何頁面,未輸出 csv。', flush=True)
微博評論內容
一級評論內容
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import json
# Cookies sent with every request in this script; replace them with your own.
# NOTE(review): these look like real session values -- presumably they should
# not be committed or shared; confirm before publishing.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7k7YlpnahLGVhB90-mk0xFNznyCVsjyu9-7-Hk0jRULM.',
    'SUB': '_2A25IaC_CDeRhGeFO61AY8i_NwzyIHXVrBC0KrDV8PUNbmtAGLVLckW9NQYCXlpjzhYwtC8sDM7giaMcMNIlWSlP6',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KzhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'ALF': '1733137172',
    '_s_tentry': 'weibo.com',
    'Apache': '435019984104.0236.1701606621998',
    'ULV': '1701606622040:13:2:2:435019984104.0236.1701606621998:1701601199048',
}
# Starting page number; no need to change it.
# NOTE(review): not referenced by the code visible in this listing.
page_num = 0
def get_content_1(uid, mid, the_first=True, max_id=None, timeout=10):
    """Request one page of first-level comments (``fetch_level`` 0).

    Args:
        uid: Sent as the ``uid`` query parameter.
        mid: Sent as the ``id`` query parameter.
        the_first: True for the first page. For later pages ``flow`` and
            ``max_id`` are added to the query.
        max_id: Paging cursor returned with the previous page; only used when
            ``the_first`` is False.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot hang the crawl.

    Returns:
        The ``requests.Response`` of the ``buildComments`` GET request.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'client-version': 'v2.43.30',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'server-version': 'v2023.09.08.4',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-requested-with': 'XMLHttpRequest',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    params = {
        'is_reload': '1',
        'id': f'{mid}',
        'is_show_bulletin': '2',
        'is_mix': '0',
        'count': '20',
        'uid': f'{uid}',
        'fetch_level': '0',
        'locale': 'zh-CN',
    }
    if not the_first:
        # Follow-up pages carry the cursor of the previous page.
        params['flow'] = 0
        params['max_id'] = max_id
    response = requests.get(
        'https://weibo.com/ajax/statuses/buildComments',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=timeout,
    )
    return response
def get_content_2(get_content_1_url, timeout=10):
    """POST the ``rum`` log entry that accompanies a ``buildComments`` request.

    The entry is a captured resource-timing record whose ``name`` field is
    replaced by the URL of the request that was just made; it is sent as a
    hand-built multipart body.

    Args:
        get_content_1_url: Full URL of the preceding ``buildComments`` request.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot hang the crawl.

    Returns:
        The response body as text.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'content-type': 'multipart/form-data; boundary=----WebKitFormBoundaryNs1Toe4Mbr8n1qXm',
        'origin': 'https://weibo.com',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    # Captured timing record used as a template; only "name" is replaced.
    template = '{"name":"https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4944997453660231&is_show_bulletin=2&is_mix=0&max_id=139282732792325&count=20&uid=1762257041&fetch_level=0&locale=zh-CN","entryType":"resource","startTime":20639.80000001192,"duration":563,"initiatorType":"xmlhttprequest","nextHopProtocol":"h2","renderBlockingStatus":"non-blocking","workerStart":0,"redirectStart":0,"redirectEnd":0,"fetchStart":20639.80000001192,"domainLookupStart":20639.80000001192,"domainLookupEnd":20639.80000001192,"connectStart":20639.80000001192,"secureConnectionStart":20639.80000001192,"connectEnd":20639.80000001192,"requestStart":20641.600000023842,"responseStart":21198.600000023842,"firstInterimResponseStart":0,"responseEnd":21202.80000001192,"transferSize":7374,"encodedBodySize":7074,"decodedBodySize":42581,"responseStatus":200,"serverTiming":[],"dns":0,"tcp":0,"ttfb":557,"pathname":"https://weibo.com/ajax/statuses/buildComments","speed":0}'
    entry = json.loads(template)
    entry['name'] = get_content_1_url
    s = json.dumps(entry)
    # The boundary must match the one announced in the content-type header.
    data = f'------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="entry"\r\n\r\n{s}\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="request_id"\r\n\r\n\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm--\r\n'
    response = requests.post(
        'https://weibo.com/ajax/log/rum',
        cookies=cookies,
        headers=headers,
        data=data,
        timeout=timeout,
    )
    return response.text
def get_once_data(uid, mid, the_first=True, max_id=None):
    """Fetch one page of comments and send the accompanying ``rum`` request.

    Args:
        uid: Forwarded to ``get_content_1``.
        mid: Forwarded to ``get_content_1``.
        the_first: True for the first page, False for follow-up pages.
        max_id: Paging cursor from the previous page (follow-up pages only).

    Returns:
        A tuple ``(max_id, df)``: the cursor for the next page and a DataFrame
        built from the ``data`` field of the response.
    """
    response_1 = get_content_1(uid, mid, the_first, max_id)
    # Each buildComments request is followed by a rum request; its reply is
    # not needed here.
    get_content_2(response_1.url)
    # Parse the body once instead of once per field.
    payload = response_1.json()
    # A payload without these keys becomes an empty page with max_id 0
    # instead of a KeyError.
    df = pd.DataFrame(payload.get('data') or [])
    max_id = payload.get('max_id', 0)
    return max_id, df
if __name__ == '__main__':
    # Set the cookies above first, then run.
    # User settings
    name = '#鄒振東誠邀張雪峰來廈門請你吃沙茶面#'
    uid = '2610806555'
    mid = '4914095331742409'
    page = 100
    # Initialisation
    df_list = []
    max_id = ''
    for i in range(page):
        if i == 0:
            max_id, df = get_once_data(uid=uid, mid=mid)
        else:
            max_id, df = get_once_data(uid=uid, mid=mid, the_first=False, max_id=max_id)
        if df.shape[0] == 0:
            break
        # Store the page before testing max_id: a page that has rows but
        # reports max_id == 0 was previously thrown away.
        df_list.append(df)
        print(f'第{i}頁解析完畢!max_id:{max_id}')
        if max_id == 0:
            break
    if df_list:
        df = pd.concat(df_list).astype(str).drop_duplicates()
        # utf_8_sig avoids garbled Chinese when the CSV is opened directly.
        df.to_csv(f'{name}.csv', index=False, encoding='utf_8_sig')
    else:
        # pd.concat raises ValueError on an empty list, so skip the export.
        print('沒有取得任何評論,未輸出 csv。')
二級評論內容
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import json
# NOTE(review): page_num is not referenced by the code visible in this listing.
page_num = 0
# Cookies sent with every request in this script; replace them with your own.
# NOTE(review): these look like real session values -- presumably they should
# not be committed or shared; confirm before publishing.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KMhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'XSRF-TOKEN': '47NC7wE7TMhcqfh1K-4bacK-',
    'ALF': '1697384140',
    'SSOLoginState': '1694792141',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7IJXuI95RLbWORIsozuK4Ohxs_boeOIedEcczDT3uSAI.',
    'SUB': '_2A25IAAmdDeRhGeFO61AY8i_NwzyIHXVrdHxVrDV8PUNbmtAGLU74kW9NQYCXlmPtQ1DG4kl_wLzqQqkPl_Do1sZu',
    '_s_tentry': 'weibo.com',
    'Apache': '3760261250067.669.1694792155706',
    'ULV': '1694792155740:8:8:4:3760261250067.669.1694792155706:1694767801057',
    'WBPSESS': 'X5DJqu8gKpwqYSp80b4XokKvi4u4_oikBqVmvlBCHvGwXMxtKAFxIPg-LIF7foS715Sa4NttSYqzj5x2Ms5ynKVOM5I_Fsy9GECAYh38R4DQ-gq7M5XOe4y1gOUqvm1hOK60dUKvrA5hLuONCL2ing==',
}
def get_content_1(uid, mid, the_first=True, max_id=None, timeout=10):
    """Request one page of second-level comments (``fetch_level`` 1).

    Args:
        uid: Sent as the ``uid`` query parameter.
        mid: Sent as the ``id`` query parameter.
        the_first: True for the first page, which is requested with
            ``max_id`` '0'. For later pages ``flow`` is added and ``max_id``
            is replaced by the given cursor.
        max_id: Paging cursor returned with the previous page; only used when
            ``the_first`` is False.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot hang the crawl.

    Returns:
        The ``requests.Response`` of the ``buildComments`` GET request.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'client-version': 'v2.43.32',
        'referer': 'https://weibo.com/1887344341/NhAosFSL4',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'server-version': 'v2023.09.14.1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-requested-with': 'XMLHttpRequest',
        'x-xsrf-token': '-UX-uyKz0jmzbTnlkyDEMvSO',
    }
    params = {
        'is_reload': '1',
        'id': f'{mid}',
        'is_show_bulletin': '2',
        'is_mix': '1',
        'fetch_level': '1',
        'max_id': '0',
        'count': '20',
        'uid': f'{uid}',
        'locale': 'zh-CN',
    }
    if not the_first:
        # Follow-up pages carry the cursor of the previous page.
        params['flow'] = 0
        params['max_id'] = max_id
    response = requests.get(
        'https://weibo.com/ajax/statuses/buildComments',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=timeout,
    )
    return response
def get_content_2(get_content_1_url, timeout=10):
    """POST the ``rum`` log entry that accompanies a ``buildComments`` request.

    The entry is a captured resource-timing record whose ``name`` field is
    replaced by the URL of the request that was just made; it is sent as a
    hand-built multipart body.

    Args:
        get_content_1_url: Full URL of the preceding ``buildComments`` request.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot hang the crawl.

    Returns:
        The response body as text.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'content-type': 'multipart/form-data; boundary=----WebKitFormBoundaryNs1Toe4Mbr8n1qXm',
        'origin': 'https://weibo.com',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    # Captured timing record used as a template; only "name" is replaced.
    template = '{"name":"https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4944997453660231&is_show_bulletin=2&is_mix=0&max_id=139282732792325&count=20&uid=1762257041&fetch_level=0&locale=zh-CN","entryType":"resource","startTime":20639.80000001192,"duration":563,"initiatorType":"xmlhttprequest","nextHopProtocol":"h2","renderBlockingStatus":"non-blocking","workerStart":0,"redirectStart":0,"redirectEnd":0,"fetchStart":20639.80000001192,"domainLookupStart":20639.80000001192,"domainLookupEnd":20639.80000001192,"connectStart":20639.80000001192,"secureConnectionStart":20639.80000001192,"connectEnd":20639.80000001192,"requestStart":20641.600000023842,"responseStart":21198.600000023842,"firstInterimResponseStart":0,"responseEnd":21202.80000001192,"transferSize":7374,"encodedBodySize":7074,"decodedBodySize":42581,"responseStatus":200,"serverTiming":[],"dns":0,"tcp":0,"ttfb":557,"pathname":"https://weibo.com/ajax/statuses/buildComments","speed":0}'
    entry = json.loads(template)
    entry['name'] = get_content_1_url
    s = json.dumps(entry)
    # The boundary must match the one announced in the content-type header.
    data = f'------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="entry"\r\n\r\n{s}\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="request_id"\r\n\r\n\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm--\r\n'
    response = requests.post(
        'https://weibo.com/ajax/log/rum',
        cookies=cookies,
        headers=headers,
        data=data,
        timeout=timeout,
    )
    return response.text
def get_once_data(uid, mid, the_first=True, max_id=None):
    """Fetch one page of comments and send the accompanying ``rum`` request.

    Args:
        uid: Forwarded to ``get_content_1``.
        mid: Forwarded to ``get_content_1``.
        the_first: True for the first page, False for follow-up pages.
        max_id: Paging cursor from the previous page (follow-up pages only).

    Returns:
        A tuple ``(max_id, df)``: the cursor for the next page and a DataFrame
        built from the ``data`` field of the response.
    """
    response_1 = get_content_1(uid, mid, the_first, max_id)
    # Each buildComments request is followed by a rum request; its reply is
    # not needed here.
    get_content_2(response_1.url)
    # Parse the body once instead of once per field.
    payload = response_1.json()
    # A payload without these keys becomes an empty page with max_id 0
    # instead of a KeyError.
    df = pd.DataFrame(payload.get('data') or [])
    max_id = payload.get('max_id', 0)
    return max_id, df
if __name__ == '__main__':
    # Update the cookies above before running.
    # First-level comments saved earlier; utf_8_sig reads the file with or
    # without a BOM.
    df = pd.read_csv('#鄒振東誠邀張雪峰來廈門請你吃沙茶面#.csv', encoding='utf_8_sig')
    # Keep only first-level comments that have second-level comments.
    df = df[df['floor_number'] > 0]
    out_dir = './二級評論數(shù)據(jù)/'
    os.makedirs(out_dir, exist_ok=True)
    page = 100
    for i in range(df.shape[0]):
        uid = df.iloc[i]['analysis_extra'].replace('|mid:', ':').split(':')[1]
        mid = df.iloc[i]['mid']
        out_path = f'{out_dir}{mid}-{uid}.csv'
        # Files from an earlier run are kept, so the crawl can be resumed.
        if os.path.exists(out_path):
            print(f'存在 {out_path}')
            continue
        print(f'不存在 {out_path}')
        df_list = []
        max_id_set = set()
        max_id = ''
        for j in range(page):
            # Stop if the cursor repeats, otherwise the same page would be
            # requested again.
            if max_id in max_id_set:
                break
            max_id_set.add(max_id)
            if j == 0:
                max_id, df_ = get_once_data(uid=uid, mid=mid)
            else:
                max_id, df_ = get_once_data(uid=uid, mid=mid, the_first=False, max_id=max_id)
            if df_.shape[0] == 0:
                break
            # Store the page before testing max_id: a page that has rows but
            # reports max_id == 0 was previously thrown away.
            df_list.append(df_)
            print(f'{mid}第{j}頁解析完畢!max_id:{max_id}')
            if max_id == 0:
                break
        if df_list:
            outdf = pd.concat(df_list).astype(str).drop_duplicates()
            print(f'文件長度為{outdf.shape[0]},文件保存為 {out_path}')
            # utf_8_sig avoids garbled Chinese when the CSV is opened directly.
            outdf.to_csv(out_path, index=False, encoding='utf_8_sig')
微博主體內容獲取流程
以華為發布會這一熱搜為例子,我們可以通過開發者模式得到信息基本都包含在下面的 div tag中
我們通過網絡這一模塊進行解析,發現信息基本都存儲在 %23 開頭的請求之中,接下來分析一下響應內容
這里可以看出響應內容為 html 格式,因此我們可以用xpath或者css來進行解析,這里我們使用BeautifulSoup來解析,解析代碼如下:
# Parse the search-result HTML held in `response` into one row per post.
soup = BeautifulSoup(response.text, 'lxml')
divs = soup.select('div[action-type="feed_list_item"]')
lst = []
for div in divs:
    mid = div.get('mid')
    uid = div.select('div.card-feed > div.avator > a')
    if uid:
        # Take the path segment between ".com/" and the query string.
        uid = uid[0].get('href').replace('.com/', '?').split('?')[1]
    else:
        uid = None
    time = div.select('div.card-feed > div.content > div.from > a:first-of-type')
    if time:
        # get_text() also works when the link wraps several nodes, where
        # .string would be None and .strip() would raise AttributeError.
        time = time[0].get_text().strip()
    else:
        time = None
    p = div.select('div.card-feed > div.content > p:last-of-type')
    if p:
        # Drop zero-width spaces and surrounding whitespace per text node.
        content = '\n'.join(
            para.replace('\u200b', '').strip() for para in p[0].strings
        ).strip()
    else:
        content = None
    star = div.select('ul > li > a > button > span.woo-like-count')
    if star:
        # First text node of the like counter; None if the span is empty
        # (list(...)[0] raised IndexError in that case).
        star = next(iter(star[0].strings), None)
    else:
        star = None
    lst.append((mid, uid, content, star, time))
# Bare expression so the table is displayed in a notebook cell.
pd.DataFrame(lst, columns=['mid', 'uid', 'content', 'star', 'time'])
我們可以獲得如下結果:
這里的 mid , uid 兩個參數是為了下一節獲取微博評論內容需要用到的參數,這里不多解釋,如果不需要刪除就好,接下來我們看一下請求內容。在開始之前,為了對請求解析方便,在這里我們點擊一下 查看全部搜索結果
可以發現一個以 weibo 開頭的新的請求,和 %23 開頭的請求內容類似,但是帶了參數 q 和 nodup ,再翻頁之后我們可以得到 page 這一個參數
我的解析如下:
1. q:話題
2. nodup:是否展示完整內容
3. page:頁碼
然后可以對這個請求進行模擬,寫入 python 代碼中,結合之前的解析,發現內容獲取成功!
完整代碼如下:
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import json
# Cookies sent with every request in this script; replace them with your own.
# NOTE(review): these look like real session values -- presumably they should
# not be committed or shared; confirm before publishing.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7k7YlpnahLGVhB90-mk0xFNznyCVsjyu9-7-Hk0jRULM.',
    'SUB': '_2A25IaC_CDeRhGeFO61AY8i_NwzyIHXVrBC0KrDV8PUNbmtAGLVLckW9NQYCXlpjzhYwtC8sDM7giaMcMNIlWSlP6',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KzhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'ALF': '1733137172',
    '_s_tentry': 'weibo.com',
    'Apache': '435019984104.0236.1701606621998',
    'ULV': '1701606622040:13:2:2:435019984104.0236.1701606621998:1701601199048',
}
def get_the_list_response(q='話題', n='1', p='頁碼', timeout=10):
    """Fetch one page of Weibo search results.

    Args:
        q: Search query (the topic), sent as the ``q`` parameter. The default
            is only a placeholder.
        n: Value of the ``nodup`` parameter.
        p: Page number, sent as the ``page`` parameter. The default is only a
            placeholder.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the crawl indefinitely.

    Returns:
        The ``requests.Response`` of the GET to ``https://s.weibo.com/weibo``.
    """
    # Headers mimicking an Edge/Chromium browser navigation request.
    headers = {
        'authority': 's.weibo.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'referer': 'https://s.weibo.com/weibo?q=%23%E6%96%B0%E9%97%BB%E5%AD%A6%E6%95%99%E6%8E%88%E6%80%92%E6%80%BC%E5%BC%A0%E9%9B%AA%E5%B3%B0%23&nodup=1',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
    }
    params = {
        'q': q,
        'nodup': n,
        'page': p,
    }
    response = requests.get(
        'https://s.weibo.com/weibo',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=timeout,
    )
    return response
def parse_the_list(text):
    """Parse one search-result page into a table of posts.

    Args:
        text: HTML source of a search-result page.

    Returns:
        A ``pandas.DataFrame`` with the columns ``mid``, ``content``, ``star``
        and ``time``. A field that cannot be found in a post is ``None``.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(text, 'html.parser')
    rows = []
    for div in soup.select('div[action-type="feed_list_item"]'):
        mid = div.get('mid')

        time_links = div.select('div.card-feed > div.content > div.from > a:first-of-type')
        # get_text() also works when the link wraps several nodes, where
        # .string would be None and .strip() would raise AttributeError.
        posted_at = time_links[0].get_text().strip() if time_links else None

        paragraphs = div.select('div.card-feed > div.content > p:last-of-type')
        if paragraphs:
            # Drop zero-width spaces and surrounding whitespace per text node.
            content = '\n'.join(
                piece.replace('\u200b', '').strip() for piece in paragraphs[0].strings
            ).strip()
        else:
            content = None

        like_spans = div.select('ul > li > a > button > span.woo-like-count')
        # First text node of the like counter; None if the span is empty
        # (the previous list(...)[0] raised IndexError in that case).
        star = next(iter(like_spans[0].strings), None) if like_spans else None

        rows.append((mid, content, star, posted_at))
    df = pd.DataFrame(rows, columns=['mid', 'content', 'star', 'time'])
    return df
def get_the_list(q, p):
    """Crawl search-result pages 1..p for the topic ``q``.

    Args:
        q: Search query (the topic).
        p: Number of pages to request, starting from page 1.

    Returns:
        A list with one DataFrame per page that answered with HTTP 200.
    """
    df_list = []
    for i in range(1, p + 1):
        response = get_the_list_response(q=q, p=i)
        if response.status_code != 200:
            # Report the failure instead of dropping the page silently.
            print(f'第{i}頁請求失敗!status_code:{response.status_code}', flush=True)
            continue
        df_list.append(parse_the_list(response.text))
        print(f'第{i}頁解析成功!', flush=True)
    return df_list
if __name__ == '__main__':
    # Set the cookies above to your own before running.
    # The topic literal had been corrupted by a stray pinyin annotation
    # "(fā)"; restored to the simplified-Chinese hashtag used on Weibo.
    q = '#华为发布会#'
    p = 20
    df_list = get_the_list(q, p)
    if df_list:
        df = pd.concat(df_list)
        # utf_8_sig avoids garbled Chinese when the CSV is opened directly.
        df.to_csv(f'{q}.csv', index=False, encoding='utf_8_sig')
    else:
        # pd.concat raises ValueError on an empty list, so skip the export.
        print('沒有取得任何頁面,未輸出 csv。', flush=True)
微博評論內容獲取流程
一級評論內容
上一節內容獲取了微博主體內容,可以發現并沒有什么難點,本來我以為都結束了,隊長偏要評論內容,無奈我只好繼續解析評論內容,接下來我們來獲取微博評論內容,有一點點繞。
首先我們點開評論數較多的微博, 然后點擊 后面還有552條評論,點擊查看
看到 <div class="vue-recycle-scroller__item-wrapper"> 這個內容是我們想要的
和上一節一樣來查找請求, 發現 buildComments?is_reload=1&id= 這個請求包含了我們想要的信息,而且預覽內容為 json 格式,省去了解析 html 的步驟,接下來只需要解析請求就ok了。
話不多說,往下滑動,多獲得幾個請求,對得到的請求,分析如下:
每次往下滑動都會出現兩個請求,一個是 buildComments?flow=0&is_reload=1&id=49451497063731… ,一個是 rum 。同時 buildComments?flow=0&is_reload=1&id=49451497063731… 請求的參數發生了變化,第一次請求里面沒有 flow 和 max_id 這兩個參數,經過我一下午分析可以得到以下結果:
1. flow:判斷是否第一次請求,第一次請求不能加
2. id:微博主體內容的id 上一節獲取的mid
3. count:評論數
4. uid:微博主體內容的用戶id 上一節獲取的uid
5. max_id:上一次請求后最后一個評論的mid,第一次請求不能加
6. 其他參數保持不變
7. rum在buildComments之后驗證請求是否人為發出,反爬機制
8. rum的參數圍繞buildComments展開
9. rum構造完全湊巧,部分參數對結果無效,能用就行!
完整代碼如下:
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import json
# Cookies sent with every request in this script; replace them with your own.
# NOTE(review): these look like real session values -- presumably they should
# not be committed or shared; confirm before publishing.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7k7YlpnahLGVhB90-mk0xFNznyCVsjyu9-7-Hk0jRULM.',
    'SUB': '_2A25IaC_CDeRhGeFO61AY8i_NwzyIHXVrBC0KrDV8PUNbmtAGLVLckW9NQYCXlpjzhYwtC8sDM7giaMcMNIlWSlP6',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KzhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'ALF': '1733137172',
    '_s_tentry': 'weibo.com',
    'Apache': '435019984104.0236.1701606621998',
    'ULV': '1701606622040:13:2:2:435019984104.0236.1701606621998:1701601199048',
}
# Starting page number; no need to change it.
# NOTE(review): not referenced by the code visible in this listing.
page_num = 0
def get_content_1(uid, mid, the_first=True, max_id=None, timeout=10):
    """Request one page of first-level comments (``fetch_level`` 0).

    Args:
        uid: Sent as the ``uid`` query parameter.
        mid: Sent as the ``id`` query parameter.
        the_first: True for the first page. For later pages ``flow`` and
            ``max_id`` are added to the query.
        max_id: Paging cursor returned with the previous page; only used when
            ``the_first`` is False.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot hang the crawl.

    Returns:
        The ``requests.Response`` of the ``buildComments`` GET request.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'client-version': 'v2.43.30',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'server-version': 'v2023.09.08.4',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-requested-with': 'XMLHttpRequest',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    params = {
        'is_reload': '1',
        'id': f'{mid}',
        'is_show_bulletin': '2',
        'is_mix': '0',
        'count': '20',
        'uid': f'{uid}',
        'fetch_level': '0',
        'locale': 'zh-CN',
    }
    if not the_first:
        # Follow-up pages carry the cursor of the previous page.
        params['flow'] = 0
        params['max_id'] = max_id
    response = requests.get(
        'https://weibo.com/ajax/statuses/buildComments',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=timeout,
    )
    return response
def get_content_2(get_content_1_url, timeout=10):
    """POST the ``rum`` log entry that accompanies a ``buildComments`` request.

    The entry is a captured resource-timing record whose ``name`` field is
    replaced by the URL of the request that was just made; it is sent as a
    hand-built multipart body.

    Args:
        get_content_1_url: Full URL of the preceding ``buildComments`` request.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot hang the crawl.

    Returns:
        The response body as text.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'content-type': 'multipart/form-data; boundary=----WebKitFormBoundaryNs1Toe4Mbr8n1qXm',
        'origin': 'https://weibo.com',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    # Captured timing record used as a template; only "name" is replaced.
    template = '{"name":"https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4944997453660231&is_show_bulletin=2&is_mix=0&max_id=139282732792325&count=20&uid=1762257041&fetch_level=0&locale=zh-CN","entryType":"resource","startTime":20639.80000001192,"duration":563,"initiatorType":"xmlhttprequest","nextHopProtocol":"h2","renderBlockingStatus":"non-blocking","workerStart":0,"redirectStart":0,"redirectEnd":0,"fetchStart":20639.80000001192,"domainLookupStart":20639.80000001192,"domainLookupEnd":20639.80000001192,"connectStart":20639.80000001192,"secureConnectionStart":20639.80000001192,"connectEnd":20639.80000001192,"requestStart":20641.600000023842,"responseStart":21198.600000023842,"firstInterimResponseStart":0,"responseEnd":21202.80000001192,"transferSize":7374,"encodedBodySize":7074,"decodedBodySize":42581,"responseStatus":200,"serverTiming":[],"dns":0,"tcp":0,"ttfb":557,"pathname":"https://weibo.com/ajax/statuses/buildComments","speed":0}'
    entry = json.loads(template)
    entry['name'] = get_content_1_url
    s = json.dumps(entry)
    # The boundary must match the one announced in the content-type header.
    data = f'------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="entry"\r\n\r\n{s}\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="request_id"\r\n\r\n\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm--\r\n'
    response = requests.post(
        'https://weibo.com/ajax/log/rum',
        cookies=cookies,
        headers=headers,
        data=data,
        timeout=timeout,
    )
    return response.text
def get_once_data(uid, mid, the_first=True, max_id=None):
    """Fetch one page of comments and send the accompanying ``rum`` request.

    Args:
        uid: Forwarded to ``get_content_1``.
        mid: Forwarded to ``get_content_1``.
        the_first: True for the first page, False for follow-up pages.
        max_id: Paging cursor from the previous page (follow-up pages only).

    Returns:
        A tuple ``(max_id, df)``: the cursor for the next page and a DataFrame
        built from the ``data`` field of the response.
    """
    response_1 = get_content_1(uid, mid, the_first, max_id)
    # Each buildComments request is followed by a rum request; its reply is
    # not needed here.
    get_content_2(response_1.url)
    # Parse the body once instead of once per field.
    payload = response_1.json()
    # A payload without these keys becomes an empty page with max_id 0
    # instead of a KeyError.
    df = pd.DataFrame(payload.get('data') or [])
    max_id = payload.get('max_id', 0)
    return max_id, df
if __name__ == '__main__':
    # Set the cookies above first, then run.
    # User settings
    name = '#鄒振東誠邀張雪峰來廈門請你吃沙茶面#'
    uid = '2610806555'
    mid = '4914095331742409'
    page = 100
    # Initialisation
    df_list = []
    max_id = ''
    for i in range(page):
        if i == 0:
            max_id, df = get_once_data(uid=uid, mid=mid)
        else:
            max_id, df = get_once_data(uid=uid, mid=mid, the_first=False, max_id=max_id)
        if df.shape[0] == 0:
            break
        # Store the page before testing max_id: a page that has rows but
        # reports max_id == 0 was previously thrown away.
        df_list.append(df)
        print(f'第{i}頁解析完畢!max_id:{max_id}')
        if max_id == 0:
            break
    if df_list:
        df = pd.concat(df_list).astype(str).drop_duplicates()
        # utf_8_sig avoids garbled Chinese when the CSV is opened directly.
        df.to_csv(f'{name}.csv', index=False, encoding='utf_8_sig')
    else:
        # pd.concat raises ValueError on an empty list, so skip the export.
        print('沒有取得任何評論,未輸出 csv。')
結束!
二級評論內容
二級評論的流程和一級評論一樣,不同的是參數
一級評論的參數
# Query parameters for first-level comments (is_mix '0', fetch_level '0').
# This fragment expects `mid` and `uid` to be defined by the surrounding code.
params = {
    'is_reload': '1',
    'id': f'{mid}',
    'is_show_bulletin': '2',
    'is_mix': '0',
    'count': '20',
    'uid': f'{uid}',
    'fetch_level': '0',
    'locale': 'zh-CN',
}
二級評論的參數
# Query parameters for second-level comments (is_mix '1', fetch_level '1',
# plus an initial max_id of '0').
# This fragment expects `mid` and `uid` to be defined by the surrounding code.
params = {
    'is_reload': '1',
    'id': f'{mid}',
    'is_show_bulletin': '2',
    'is_mix': '1',
    'fetch_level': '1',
    'max_id': '0',
    'count': '20',
    'uid': f'{uid}',
    'locale': 'zh-CN',
}
二級評論參數的uid指的是微博主體內容的作者uid,而mid指的是評論者的mid
完整代碼如下:
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import json
# NOTE(review): page_num is not referenced by the code visible in this listing.
page_num = 0
# Cookies sent with every request in this script; replace them with your own.
# NOTE(review): these look like real session values -- presumably they should
# not be committed or shared; confirm before publishing.
cookies = {
    'SINAGLOBAL': '1278126679099.0298.1694199077980',
    'SUBP': '0033WrSXqPxfM725Ws9jqgMF55529P9D9W5mzQcPEhHvorRG-l7.BSsy5JpX5KMhUgL.FoM7ehz4eo2p1h52dJLoI0qLxK-LBKBLBKMLxKnL1--L1heLxKnL1-qLBo.LxK-L1KeL1KzLxK-L1KeL1KzLxK-L1KeL1Kzt',
    'XSRF-TOKEN': '47NC7wE7TMhcqfh1K-4bacK-',
    'ALF': '1697384140',
    'SSOLoginState': '1694792141',
    'SCF': 'ApDYB6ZQHU_wHU8ItPHSso29Xu0ZRSkOOiFTBeXETNm7IJXuI95RLbWORIsozuK4Ohxs_boeOIedEcczDT3uSAI.',
    'SUB': '_2A25IAAmdDeRhGeFO61AY8i_NwzyIHXVrdHxVrDV8PUNbmtAGLU74kW9NQYCXlmPtQ1DG4kl_wLzqQqkPl_Do1sZu',
    '_s_tentry': 'weibo.com',
    'Apache': '3760261250067.669.1694792155706',
    'ULV': '1694792155740:8:8:4:3760261250067.669.1694792155706:1694767801057',
    'WBPSESS': 'X5DJqu8gKpwqYSp80b4XokKvi4u4_oikBqVmvlBCHvGwXMxtKAFxIPg-LIF7foS715Sa4NttSYqzj5x2Ms5ynKVOM5I_Fsy9GECAYh38R4DQ-gq7M5XOe4y1gOUqvm1hOK60dUKvrA5hLuONCL2ing==',
}
def get_content_1(uid, mid, the_first=True, max_id=None, timeout=10):
    """Request one page of second-level comments (``fetch_level`` 1).

    Args:
        uid: Sent as the ``uid`` query parameter.
        mid: Sent as the ``id`` query parameter.
        the_first: True for the first page, which is requested with
            ``max_id`` '0'. For later pages ``flow`` is added and ``max_id``
            is replaced by the given cursor.
        max_id: Paging cursor returned with the previous page; only used when
            ``the_first`` is False.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot hang the crawl.

    Returns:
        The ``requests.Response`` of the ``buildComments`` GET request.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'client-version': 'v2.43.32',
        'referer': 'https://weibo.com/1887344341/NhAosFSL4',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'server-version': 'v2023.09.14.1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-requested-with': 'XMLHttpRequest',
        'x-xsrf-token': '-UX-uyKz0jmzbTnlkyDEMvSO',
    }
    params = {
        'is_reload': '1',
        'id': f'{mid}',
        'is_show_bulletin': '2',
        'is_mix': '1',
        'fetch_level': '1',
        'max_id': '0',
        'count': '20',
        'uid': f'{uid}',
        'locale': 'zh-CN',
    }
    if not the_first:
        # Follow-up pages carry the cursor of the previous page.
        params['flow'] = 0
        params['max_id'] = max_id
    response = requests.get(
        'https://weibo.com/ajax/statuses/buildComments',
        params=params,
        cookies=cookies,
        headers=headers,
        timeout=timeout,
    )
    return response
def get_content_2(get_content_1_url, timeout=10):
    """POST the ``rum`` log entry that accompanies a ``buildComments`` request.

    The entry is a captured resource-timing record whose ``name`` field is
    replaced by the URL of the request that was just made; it is sent as a
    hand-built multipart body.

    Args:
        get_content_1_url: Full URL of the preceding ``buildComments`` request.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot hang the crawl.

    Returns:
        The response body as text.
    """
    headers = {
        'authority': 'weibo.com',
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'content-type': 'multipart/form-data; boundary=----WebKitFormBoundaryNs1Toe4Mbr8n1qXm',
        'origin': 'https://weibo.com',
        'referer': 'https://weibo.com/1762257041/NiSAxfmbZ',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
        'x-xsrf-token': 'F2EEQZrINBfzB2HPPxqTMQJ_',
    }
    # Captured timing record used as a template; only "name" is replaced.
    template = '{"name":"https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4944997453660231&is_show_bulletin=2&is_mix=0&max_id=139282732792325&count=20&uid=1762257041&fetch_level=0&locale=zh-CN","entryType":"resource","startTime":20639.80000001192,"duration":563,"initiatorType":"xmlhttprequest","nextHopProtocol":"h2","renderBlockingStatus":"non-blocking","workerStart":0,"redirectStart":0,"redirectEnd":0,"fetchStart":20639.80000001192,"domainLookupStart":20639.80000001192,"domainLookupEnd":20639.80000001192,"connectStart":20639.80000001192,"secureConnectionStart":20639.80000001192,"connectEnd":20639.80000001192,"requestStart":20641.600000023842,"responseStart":21198.600000023842,"firstInterimResponseStart":0,"responseEnd":21202.80000001192,"transferSize":7374,"encodedBodySize":7074,"decodedBodySize":42581,"responseStatus":200,"serverTiming":[],"dns":0,"tcp":0,"ttfb":557,"pathname":"https://weibo.com/ajax/statuses/buildComments","speed":0}'
    entry = json.loads(template)
    entry['name'] = get_content_1_url
    s = json.dumps(entry)
    # The boundary must match the one announced in the content-type header.
    data = f'------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="entry"\r\n\r\n{s}\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm\r\nContent-Disposition: form-data; name="request_id"\r\n\r\n\r\n------WebKitFormBoundaryNs1Toe4Mbr8n1qXm--\r\n'
    response = requests.post(
        'https://weibo.com/ajax/log/rum',
        cookies=cookies,
        headers=headers,
        data=data,
        timeout=timeout,
    )
    return response.text
def get_once_data(uid, mid, the_first=True, max_id=None):
    """Fetch one page of comments and send the accompanying ``rum`` request.

    Args:
        uid: Forwarded to ``get_content_1``.
        mid: Forwarded to ``get_content_1``.
        the_first: True for the first page, False for follow-up pages.
        max_id: Paging cursor from the previous page (follow-up pages only).

    Returns:
        A tuple ``(max_id, df)``: the cursor for the next page and a DataFrame
        built from the ``data`` field of the response.
    """
    response_1 = get_content_1(uid, mid, the_first, max_id)
    # Each buildComments request is followed by a rum request; its reply is
    # not needed here.
    get_content_2(response_1.url)
    # Parse the body once instead of once per field.
    payload = response_1.json()
    # A payload without these keys becomes an empty page with max_id 0
    # instead of a KeyError.
    df = pd.DataFrame(payload.get('data') or [])
    max_id = payload.get('max_id', 0)
    return max_id, df
if __name__ == '__main__':
    # Update the cookies above before running.
    # First-level comments saved earlier; utf_8_sig reads the file with or
    # without a BOM.
    df = pd.read_csv('#鄒振東誠邀張雪峰來廈門請你吃沙茶面#.csv', encoding='utf_8_sig')
    # Keep only first-level comments that have second-level comments.
    df = df[df['floor_number'] > 0]
    out_dir = './二級評論數(shù)據(jù)/'
    os.makedirs(out_dir, exist_ok=True)
    page = 100
    for i in range(df.shape[0]):
        uid = df.iloc[i]['analysis_extra'].replace('|mid:', ':').split(':')[1]
        mid = df.iloc[i]['mid']
        out_path = f'{out_dir}{mid}-{uid}.csv'
        # Files from an earlier run are kept, so the crawl can be resumed.
        if os.path.exists(out_path):
            print(f'存在 {out_path}')
            continue
        print(f'不存在 {out_path}')
        df_list = []
        max_id_set = set()
        max_id = ''
        for j in range(page):
            # Stop if the cursor repeats, otherwise the same page would be
            # requested again.
            if max_id in max_id_set:
                break
            max_id_set.add(max_id)
            if j == 0:
                max_id, df_ = get_once_data(uid=uid, mid=mid)
            else:
                max_id, df_ = get_once_data(uid=uid, mid=mid, the_first=False, max_id=max_id)
            if df_.shape[0] == 0:
                break
            # Store the page before testing max_id: a page that has rows but
            # reports max_id == 0 was previously thrown away.
            df_list.append(df_)
            print(f'{mid}第{j}頁解析完畢!max_id:{max_id}')
            if max_id == 0:
                break
        if df_list:
            outdf = pd.concat(df_list).astype(str).drop_duplicates()
            print(f'文件長度為{outdf.shape[0]},文件保存為 {out_path}')
            # utf_8_sig avoids garbled Chinese when the CSV is opened directly.
            outdf.to_csv(out_path, index=False, encoding='utf_8_sig')
代碼運行結果
完成!文章來源:http://www.zghlxwxcb.cn/news/detail-839795.html
問題匯總
csv文件亂碼
把 df.to_csv(...)
改為 df.to_csv(..., encoding='utf_8_sig')
文章來源地址http://www.zghlxwxcb.cn/news/detail-839795.html
到了這里,關于微博數據采集,微博爬蟲,微博網頁解析,完整代碼(主體內容+評論內容)的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持TOY模板網!