起因:頁面展示的內容被隱藏了部分,無法獲取完整內容
處理方案:
1.利用request模擬接口獲取返參,模擬了請求頭,但操作時一直無法獲得數據,報錯:org.apache.catalina.connector.ClientAbortException。未深究,大概率是服務器安全問題
selenium獲取請求頭可參考:https://blog.csdn.net/qq_31042199/article/details/119278315
但是seleniumwire好像跟selenium庫有衝突,不知道是否需要把selenium卸載了再安裝seleniumwire,待後續有時間驗證一下。
2.通過webdriver提供的API查詢,使用的函數(shù)是Network.getResponseBody
代碼:
import json
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
# Local import keeps this snippet self-contained; the exception type comes
# from the selenium package the script already depends on.
from selenium.common.exceptions import WebDriverException

# URL of the request whose response body should be captured.
TARGET_URL = "https://s3.pstatp.com/bytecom/resource/union_web2/media/manifest.json"

# Capabilities that turn on Chrome's "performance" log with network events
# enabled, so every request the page makes is recorded with its requestId.
caps = {
    'browserName': 'chrome',
    'loggingPrefs': {
        'browser': 'ALL',
        'driver': 'ALL',
        'performance': 'ALL',
    },
    'goog:chromeOptions': {
        'perfLoggingPrefs': {
            'enableNetwork': True,
        },
        'w3c': False,
    },
}

# NOTE(review): `desired_capabilities` is a legacy keyword argument; recent
# Selenium 4 releases reportedly dropped it -- confirm against the installed
# version before upgrading.
driver = webdriver.Chrome(desired_capabilities=caps)
try:
    driver.get('https://partner.oceanengine.com/union/media/login/')
    # Wait a while before reading the log: reading it too early reports that
    # no log information is available, because the requests have to finish
    # before their log entries can be fetched.
    time.sleep(3)
    request_log = driver.get_log('performance')
    print(request_log)

    for entry in request_log:
        # Each log entry wraps one DevTools event as a JSON string.
        params = json.loads(entry['message'])['message']['params']
        # Use .get() so events without a "request" field are skipped
        # instead of raising KeyError.
        request = params.get('request')
        if request is None:
            continue
        if request.get('url') != TARGET_URL:
            continue

        # The requestId identifies the network request inside DevTools.
        request_id = params['requestId']
        print(request_id)
        try:
            # Ask DevTools for the response body belonging to that requestId.
            content = driver.execute_cdp_cmd(
                'Network.getResponseBody', {'requestId': request_id}
            )
        except WebDriverException as exc:
            # Typically "No resource with given identifier found"; report it
            # and try the next matching log entry instead of crashing.
            print(f"Could not fetch response body for request {request_id}: {exc}")
            continue
        print(content)
        break
finally:
    # Always close the browser, even when a step above raised.
    driver.quit()
3.以上方法在操作中一直報錯“no resource with given identifier found”,經排查,requestId是可以獲取的,後來發現該接口是Ajax,selenium抓取Ajax接口的數據用如下demo(這個demo中包含了動態url獲取過程):文章來源:http://www.zghlxwxcb.cn/news/detail-512514.html
import os, time, json
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# XPaths of the login form controls; they are tied to the target page's markup.
USERNAME_XPATH = '//*[@id="app"]/section/div[1]/div/div[1]/div/div[2]/form/div[1]/div/div/div/input'
PASSWORD_XPATH = '//*[@id="app"]/section/div[1]/div/div[1]/div/div[2]/form/div[2]/div/div/div/input'
SUBMIT_XPATH = '//*[@id="app"]/section/div[1]/div/div[1]/div/div[2]/form/div[3]/div/button'


def log_filter(log_):
    """Return True for a ``Network.responseReceived`` event with a JSON MIME type.

    Args:
        log_: One decoded DevTools event (the ``message`` object of a
            performance-log entry).
    """
    return (
        # is an actual response
        log_.get("method") == "Network.responseReceived"
        # and json
        and "json" in log_["params"]["response"].get("mimeType", "")
    )


def _fill_input(browser, xpath, text):
    """Click the input located by *xpath*, clear it and type *text* into it."""
    box = browser.find_element(By.XPATH, xpath)
    box.click()
    box.clear()
    box.send_keys(text)


if __name__ == '__main__':
    # Local import: the exception type comes from the selenium package this
    # script already depends on.
    from selenium.common.exceptions import WebDriverException

    # Configure chromedriver. Flags such as --headless, --no-sandbox,
    # --disable-extensions or --disable-gpu can be added here via
    # chrome_options.add_argument(...) when running without a display.
    chrome_options = webdriver.ChromeOptions()
    download_dir = os.getenv('OS_LOG_PATH')
    if download_dir:
        # Only override the download directory when the variable is set,
        # so Chrome is never handed a null path.
        chrome_options.add_experimental_option(
            'prefs', {'download.default_directory': download_dir}
        )

    # Make Chrome log network events. Copy the template first:
    # DesiredCapabilities.CHROME is a shared class-level dict and must not be
    # mutated in place.
    capabilities = DesiredCapabilities.CHROME.copy()
    capabilities["goog:loggingPrefs"] = {"performance": "ALL"}

    # NOTE(review): `executable_path` and `desired_capabilities` are legacy
    # keyword arguments; recent Selenium 4 releases reportedly dropped them --
    # confirm against the installed version before upgrading.
    browser = webdriver.Chrome(
        executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe',
        options=chrome_options,
        desired_capabilities=capabilities,
    )
    try:
        # The implicit wait is a session-wide setting for element lookups,
        # so configuring it once is enough.
        browser.implicitly_wait(3)
        browser.get("https://#########")

        # Enter the account name.
        _fill_input(browser, USERNAME_XPATH, "#######")
        # Enter the password.
        _fill_input(browser, PASSWORD_XPATH, "#######")
        # Click the login button.
        browser.find_element(By.XPATH, SUBMIT_XPATH).click()
        # Wait 5 seconds after submitting the login form.
        time.sleep(5)

        # Open the page whose Ajax responses should be captured, then wait
        # before reading the log.
        base_url = "https://#########"
        browser.get(base_url)
        time.sleep(3)

        # Extract the DevTools events from the performance log.
        logs_raw = browser.get_log("performance")
        logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

        for log in filter(log_filter, logs):
            request_id = log["params"]["requestId"]
            resp_url = log["params"]["response"]["url"]
            print(request_id)
            print(f"Caught {resp_url}")
            try:
                body = browser.execute_cdp_cmd(
                    "Network.getResponseBody", {"requestId": request_id}
                )
            except WebDriverException as exc:
                # One unavailable body must not abort the remaining dumps.
                print(f"Could not fetch response body for {resp_url}: {exc}")
                continue
            # Write the CDP result to a JSON file named after the request id.
            with open(f'{request_id}.json', 'w', encoding='utf-8') as f:
                f.write(json.dumps(body, indent=4))
    finally:
        # Always close the browser, even when a step above raised.
        browser.quit()
後續如遇見更多的獲取接口數據的情況,再做補充。文章來源地址:http://www.zghlxwxcb.cn/news/detail-512514.html
到了這裡,關於利用selenium獲取接口數據的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續瀏覽下面的相關文章,希望大家以後多多支持TOY模板網!