
[Web Scraping] Automatically search by keyword and scrape the results

This article walks through how to automatically search by keyword and scrape the results. Hopefully it is useful as a reference; if anything is wrong or incomplete, corrections are welcome.

The goal is to search a site by keyword automatically and scrape the information from the result pages.
Result pages come in two flavours: those where the total page count can be read directly, and those where it cannot.
The two cases call for different approaches; a bird's-eye sketch of both is given below, followed by the full scripts:
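A minimal sketch of the two paging strategies. Note that fetch_page(keyword, page) and the page sizes here are hypothetical placeholders standing in for one Selenium page load that returns how many result rows it found; they are not part of the scripts that follow.

def fetch_page(keyword, page):
    # Hypothetical placeholder: load one results page with Selenium and
    # return the number of result rows it contained.
    return 0

# Case 1: the total result count is printed on the page, so the number of
# pages is known before scraping (the first script assumes 10 results per page).
def crawl_known_total(keyword, total_results, page_size=10):
    for page in range(max(1, total_results // page_size)):
        fetch_page(keyword, page)

# Case 2: no total is shown, so keep turning pages until one comes back with
# fewer rows than a full page (the second script assumes 20 results per page).
def crawl_until_short_page(keyword, page_size=20):
    page = 0
    while fetch_page(keyword, page) == page_size:
        page += 1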

Case 1: scrape the page count first, then scrape the data on each page

# coding=utf-8
import pandas as pd
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import re
import random

option = webdriver.ChromeOptions()
option.add_argument("--headless")                          # run Chrome without a window
# option.binary_location = r"...\chrome.exe"
option.add_argument('blink-settings=imagesEnabled=false')  # skip images to speed up page loads
driver = webdriver.Chrome(executable_path=r"...\chromedriver.exe",
                          options=option)
head_url = "<base search URL>key="       # base part of the search URL, ending in key=
keywords_all = []                        # fill in the full keyword list here
keywords = keywords_all[410:444]         # slice of keywords to process in this run

keyword_list = []
product_name_list = []
company_name_list = []
company_url_list = []
phone_list = []


def PageNumber(keyword):
    # Return the total number of results for this keyword, or -1 if no count is shown.
    wd = urllib.parse.quote(keyword.encode('gb2312'))  # this site expects GB2312-encoded queries
    turn_url = head_url + wd + ';use_cas=0;f=pclist;p=0'
    driver.get(turn_url)
    # print(driver.page_source)
    time.sleep(random.randint(1, 3))
    try:
        source = driver.find_element(By.XPATH,
                                     "//div[@class='gys']/dl/dt/span").text
        reg = re.findall(r".*有(.*)家", source)   # e.g. extracts "123" from text like "共有123家"
        page_number = int(reg[0])
        print("Found", page_number, "results in total")
        return page_number
    except Exception:
        return -1

def GetResult(keyword, page):
    # Scrape one page of results for the keyword and append them to the global lists.
    wd = urllib.parse.quote(keyword.encode('gb2312'))
    turn_url = head_url + wd + ';use_cas=0;f=pclist;p=' + str(page)
    print(turn_url)
    try:
        driver.get(turn_url)
        time.sleep(random.randint(2, 4))
        results = driver.find_elements(By.XPATH,
                                       "//div[@class='gys']/dl/dd/form")
        for l in results:
            company = l.find_element(By.XPATH, "./table/tbody/tr/td/a").text
            print(company)
            company_name_list.append(company)
            company_url = l.find_element(By.XPATH, "./table/tbody/tr/td/a[1]").get_attribute('href')
            print(company_url)
            company_url_list.append(company_url)
            phone = l.find_element(By.XPATH, "./table/tbody/tr[2]/td[2]").text
            print(phone)
            phone_list.append(phone)
            print(keyword)
            keyword_list.append(keyword)
    except Exception:
        print('Failed to load the page')

for i in keywords:
    total = PageNumber(keyword=i)
    if total == -1:
        print(i, 'no data')
        continue
    page_number = int(total / 10)   # 10 results per page
    if page_number == 0:
        try:
            GetResult(keyword=i, page=0)
        except Exception:
            continue
    else:
        for p in range(0, page_number):
            try:
                GetResult(keyword=i, page=p)
            except Exception:
                continue

data_list = []
for a, b, c, d in zip(keyword_list, company_name_list, company_url_list, phone_list):
    x = {}
    x['keyword'] = a
    x['company_name'] = b
    x['company_url'] = c
    x['phone'] = d
    data_list.append(x)
# print(data_list)
with open(r"###.csv", 'w', newline='', encoding='UTF-8') as f_c_csv:
    writer = csv.writer(f_c_csv)
    writer.writerow(['keyword' ,'company_name', 'company_url', 'phone'])
    for nl in data_list:
        writer.writerow(nl.values())
print("Done writing!")
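A side note: pandas is imported at the top of this script but never used. Assuming the four lists end up the same length, the data_list and csv.writer steps could just as well be a DataFrame written with to_csv; a minimal sketch (the output path is the same placeholder as above, and utf-8-sig is only a suggestion so Excel displays Chinese text correctly):

df = pd.DataFrame({
    'keyword': keyword_list,
    'company_name': company_name_list,
    'company_url': company_url_list,
    'phone': phone_list,
})
df.to_csv(r"###.csv", index=False, encoding='utf-8-sig')  # utf-8-sig keeps Excel happy with Chinese text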

Case 2: the page count cannot be scraped, so results are collected by turning pages one at a time

# coding=utf-8
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
import random
import pandas as pd

option = webdriver.ChromeOptions()
option.add_argument("--headless")
# option.binary_location = r"...\chrome.exe"
option.add_argument('blink-settings=imagesEnabled=false')
driver = webdriver.Chrome(executable_path=r"...\chromedriver.exe",
                          options=option)
head_url = "<base search URL>keyword="   # base part of the search URL, ending in keyword=
keywords_all = []                        # fill in the full keyword list here
keywords = keywords_all[400:444]         # slice of keywords to process in this run

keyword_list = []
product_name_list = []
company_name_list = []
company_url_list = []
mobilephone_list = []
telephone_list = []

def NextPage(keyword, page):
    # Load one results page and return how many result rows it contains.
    wd = urllib.parse.quote(keyword.encode('utf-8'))  # this site expects UTF-8-encoded queries
    if page == 0:
        turn_url = head_url + wd
    else:
        turn_url = head_url + wd + "&p=" + str(page)
    print(turn_url)
    driver.get(turn_url)
    time.sleep(random.randint(1, 3))
    results = driver.find_elements(By.XPATH,
                                   "//div[@class='lc-grid-list']//div[@class='container']//div[@class='grid-body']//div[@class='lc-main']//div[@class='lc-products-wrap']//div[@class='pro-item clearfix ']")
    return len(results)

def GetResult(keyword, page):
    # Scrape one page of results for the keyword and append them to the global lists.
    wd = urllib.parse.quote(keyword.encode('utf-8'))
    if page == 0:
        turn_url = head_url + wd
    else:
        turn_url = head_url + wd + "&p=" + str(page)
    driver.get(turn_url)
    time.sleep(random.randint(3, 5))
    try:
        results = driver.find_elements(By.XPATH,
                                       "//div[@class='lc-grid-list']//div[@class='container']//div[@class='grid-body']//div[@class='lc-main']//div[@class='lc-products-wrap']//div[@class='pro-item clearfix ']")
        for l in results:
            product_name = l.find_element(By.XPATH, "./div[@class='pro-info']/div[@class='intro-box']/div[@class='tt']/a").text
            print(product_name)
            product_name_list.append(product_name)
            # Use empty placeholders when the phone fields are missing so the lists stay aligned.
            telephone = mobilephone = ''
            try:
                telephone = l.find_element(By.XPATH, "./div[@class='pro-info']/div[@class='basic-box']/div[@class='info']/dl/dd[2]").text
                print(telephone)
                mobilephone = l.find_element(By.XPATH,
                                             "./div[@class='pro-info']/div[@class='basic-box']/div[@class='info']/dl/dd[3]").text
                print(mobilephone)
            except Exception:
                pass
            telephone_list.append(telephone)
            mobilephone_list.append(mobilephone)
            company = l.find_element(By.XPATH,
                                     "./div[@class='pro-info']/div[@class='basic-box']/div[@class='title']/em").text
            print(company)
            company_name_list.append(company)
            for link in l.find_elements(By.XPATH, "./div[@class='pro-info']/div[@class='basic-box']/div[@class='title']/em/a"):
                company_url = link.get_attribute('href')
                print(company_url)
                company_url_list.append(company_url)
            print(keyword)
            keyword_list.append(keyword)
    except Exception:
        print("Scraping this page failed")

for i in keywords:
    this_page = 0
    count = NextPage(keyword=i, page=this_page)
    while count > 19:            # a full page (20 items) means there may be another page
        GetResult(keyword=i, page=this_page)
        this_page = this_page + 1
        count = NextPage(keyword=i, page=this_page)
    if 0 < count < 20:           # scrape the final, partially filled page
        GetResult(keyword=i, page=this_page)

data_list = []
for a, b, c, d, e, f in zip(keyword_list, product_name_list, company_name_list, company_url_list, mobilephone_list, telephone_list):
    x = {}
    x['keyword'] = a
    x['product_name'] = b
    x['company_name'] = c
    x['company_url'] = d
    x['mobilephone'] = e
    x['telephone'] = f
    data_list.append(x)
# print(data_list)

with open("###.csv", 'w', newline='', encoding='UTF-8') as f_c_csv:
    writer = csv.writer(f_c_csv)
    writer.writerow(['keyword', 'product_name','company_name', 'company_url', 'mobilephone', 'telephone'])
    for nl in data_list:
        writer.writerow(nl.values())
print("Done writing!")
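One difference between the two scripts worth noting: the first site expects its query encoded as GB2312 while the second expects UTF-8, which is why the keyword goes through keyword.encode('gb2312') in the first script and keyword.encode('utf-8') in the second before urllib.parse.quote. A quick illustration with a made-up keyword:

import urllib.parse

kw = "轴承"  # example keyword only
print(urllib.parse.quote(kw.encode('gb2312')))  # percent-encoded GB2312 bytes
print(urllib.parse.quote(kw.encode('utf-8')))   # percent-encoded UTF-8 bytes, a different query string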

That concludes this article on automatically searching by keyword and scraping the results.
