Automatically search by keyword and scrape the matching web pages.
There are two kinds of result pages: those where the total page count can be read directly, and those where it cannot.
The two cases call for different approaches:
Case 1: scrape the page count first, then scrape the data on each page.
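Before the full script, a quick sketch of how a Chinese keyword is turned into a URL-safe query string. The target site appears to expect GB2312-encoded query parameters (that is what the script below does); the keyword "傳感器" and the base URL are only illustrative placeholders.

import urllib.parse

keyword = "傳感器"                                   # hypothetical example keyword
wd = urllib.parse.quote(keyword.encode('gb2312'))    # percent-encode the GB2312 bytes
print(wd)                                            # prints the percent-encoded GB2312 bytes
url = "<base search URL>key=" + wd + ';use_cas=0;f=pclist;p=0'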
# coding=utf-8
import pandas as pd
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import re
import random
option = webdriver.ChromeOptions()
option.add_argument("--headless")                          # run Chrome without a visible window
# option.binary_location = r"...\chrome.exe"
option.add_argument('blink-settings=imagesEnabled=false')  # skip image loading to speed things up
driver = webdriver.Chrome(executable_path=r"...\chromedriver.exe",
                          options=option)
head_url = "<partial base URL of the search page, ending with key=>"  # placeholder
keywords_all = []                  # fill in the full keyword list here
keywords = keywords_all[410:444]   # slice of keywords handled in this run
keyword_list = []
product_name_list = []
company_name_list = []
company_url_list = []
phone_list = []
def PageNumber(keyword):
    """Return the total number of result entries for a keyword, or -1 if none are found."""
    wd = urllib.parse.quote(keyword.encode('gb2312'))
    turn_url = head_url + wd + ';use_cas=0;f=pclist;p=0'
    driver.get(turn_url)
    # print(driver.page_source)
    time.sleep(random.randint(1, 3))
    try:
        source = driver.find_element(By.XPATH,
                                     "//div[@class='gys']/dl/dt/span").text
        reg = re.findall(r".*有(.*)家", source)
        page_number = int(reg[0])
        print("found", page_number, "entries in total")
        return page_number
    except Exception:
        return -1
def GetResult(keyword, page):
    """Scrape company name, company URL and phone number from one result page."""
    wd = urllib.parse.quote(keyword.encode('gb2312'))
    turn_url = head_url + wd + ';use_cas=0;f=pclist;p=' + str(page)
    print(turn_url)
    try:
        driver.get(turn_url)
        time.sleep(random.randint(2, 4))
        rows = driver.find_elements(By.XPATH,
                                    "//div[@class='gys']/dl/dd/form")
        for l in rows:
            company = l.find_element(By.XPATH, "./table/tbody/tr/td/a").text
            print(company)
            company_name_list.append(company)
            company_url = l.find_element(By.XPATH, "./table/tbody/tr/td/a[1]").get_attribute('href')
            print(company_url)
            company_url_list.append(company_url)
            phone = l.find_element(By.XPATH, "./table/tbody/tr[2]/td[2]").text
            print(phone)
            phone_list.append(phone)
            print(keyword)
            keyword_list.append(keyword)
    except Exception:
        print('failed to load the result page')
for i in keywords:
    # PageNumber returns -1 when nothing was found; handle that before computing pages
    total = PageNumber(keyword=i)
    if total == -1:
        print(i, 'no data')
        continue
    page_number = total // 10        # the site shows 10 entries per result page
    if page_number == 0:
        try:
            GetResult(keyword=i, page=0)
        except Exception:
            continue
    else:
        for p in range(0, page_number):
            try:
                GetResult(keyword=i, page=p)
            except Exception:
                continue
data_list = []
for a, b, c, d in zip(keyword_list, company_name_list, company_url_list, phone_list):
    x = {}
    x['keyword'] = a
    x['company_name'] = b
    x['company_url'] = c
    x['phone'] = d
    data_list.append(x)
# print(data_list)
with open(r"###.csv", 'w', newline='', encoding='UTF-8') as f_c_csv:
    writer = csv.writer(f_c_csv)
    writer.writerow(['keyword', 'company_name', 'company_url', 'phone'])
    for nl in data_list:
        writer.writerow(nl.values())
print("Finished writing the CSV!")
Case 2: the page count cannot be read from the page, so the script keeps turning pages until a page comes back with fewer results than a full page.
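The stop condition used below is simple: a full result page appears to hold 20 items, so paging continues until a page returns fewer than 20. A minimal sketch of that pattern (it relies on the NextPage and GetResult functions defined in the script that follows, and the page size of 20 is an assumption taken from that script):

def scrape_all_pages(keyword, page_size=20):
    page = 0
    while True:
        n_items = NextPage(keyword=keyword, page=page)  # count items on this page
        GetResult(keyword=keyword, page=page)           # scrape them
        if n_items < page_size:                         # a short page is the last page
            break
        page += 1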
# coding=utf-8
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import random
import pandas as pd
option = webdriver.ChromeOptions()
option.add_argument("--headless")                          # run Chrome without a visible window
# option.binary_location = r"...\chrome.exe"
option.add_argument('blink-settings=imagesEnabled=false')  # skip image loading to speed things up
driver = webdriver.Chrome(executable_path=r"...\chromedriver.exe", options=option)
head_url = "<partial base URL of the search page, ending with keyword=>"  # placeholder
keywords_all = []                  # fill in the full keyword list here
keywords = keywords_all[400:444]   # slice of keywords handled in this run
keyword_list = []
product_name_list = []
company_name_list = []
company_url_list = []
mobilephone_list = []
telephone_list = []
def NextPage(keyword, page):
    """Open a result page and return how many product items it contains."""
    wd = urllib.parse.quote(keyword.encode('utf-8'))
    if page == 0:
        turn_url = head_url + wd
    else:
        turn_url = head_url + wd + "&p=" + str(page)
    print(turn_url)
    driver.get(turn_url)
    time.sleep(random.randint(1, 3))
    items = driver.find_elements(By.XPATH,
                                 "//div[@class='lc-grid-list']//div[@class='container']//div[@class='grid-body']//div[@class='lc-main']//div[@class='lc-products-wrap']//div[@class='pro-item clearfix ']")
    return len(items)
def GetResult(keyword, page):
    """Scrape product name, phone numbers, company name and URL from one result page."""
    wd = urllib.parse.quote(keyword.encode('utf-8'))
    if page == 0:
        turn_url = head_url + wd
    else:
        turn_url = head_url + wd + "&p=" + str(page)
    driver.get(turn_url)
    time.sleep(random.randint(3, 5))
    try:
        items = driver.find_elements(By.XPATH,
                                     "//div[@class='lc-grid-list']//div[@class='container']//div[@class='grid-body']//div[@class='lc-main']//div[@class='lc-products-wrap']//div[@class='pro-item clearfix ']")
        for l in items:
            product_name = l.find_element(By.XPATH, "./div[@class='pro-info']/div[@class='intro-box']/div[@class='tt']/a").text
            print(product_name)
            product_name_list.append(product_name)
            try:
                telephone = l.find_element(By.XPATH, "./div[@class='pro-info']/div[@class='basic-box']/div[@class='info']/dl/dd[2]").text
                mobilephone = l.find_element(By.XPATH,
                                             "./div[@class='pro-info']/div[@class='basic-box']/div[@class='info']/dl/dd[3]").text
            except Exception:
                # keep every list the same length even when an item has no phone information
                telephone, mobilephone = '', ''
            print(telephone)
            telephone_list.append(telephone)
            print(mobilephone)
            mobilephone_list.append(mobilephone)
            company = l.find_element(By.XPATH,
                                     "./div[@class='pro-info']/div[@class='basic-box']/div[@class='title']/em").text
            print(company)
            company_name_list.append(company)
            links = l.find_elements(By.XPATH, "./div[@class='pro-info']/div[@class='basic-box']/div[@class='title']/em/a")
            company_url = links[0].get_attribute('href') if links else ''
            print(company_url)
            company_url_list.append(company_url)
            print(keyword)
            keyword_list.append(keyword)
    except Exception:
        print("failed to scrape the page")
for i in keywords:
    this_page = 0
    # a full page holds 20 items; keep paging while pages come back full
    while NextPage(keyword=i, page=this_page) > 19:
        GetResult(keyword=i, page=this_page)
        this_page = this_page + 1
    # the loop ends on a partially filled (or empty) page; scrape it too
    GetResult(keyword=i, page=this_page)
data_list = []
for a, b, c, d, e, f in zip(keyword_list, product_name_list, company_name_list, company_url_list, mobilephone_list, telephone_list):
    x = {}
    x['keyword'] = a
    x['product_name'] = b
    x['company_name'] = c
    x['company_url'] = d
    x['mobilephone'] = e
    x['telephone'] = f
    data_list.append(x)
# print(data_list)
with open("###.csv", 'w', newline='', encoding='UTF-8') as f_c_csv:
    writer = csv.writer(f_c_csv)
    writer.writerow(['keyword', 'product_name', 'company_name', 'company_url', 'mobilephone', 'telephone'])
    for nl in data_list:
        writer.writerow(nl.values())
print("Finished writing the CSV!")
That concludes this walkthrough of automatically searching by keyword and scraping the results.