1. Data Crawling
The crawl covers 30 job categories, using the Chinese search keywords fed to the scraper (數據分析, 產品經理, 產品助理, 交互設計, 前端開發, 軟件設計, IOS開發, 業務分析, 安卓開發, PHP開發, 業務咨詢, 需求分析, 流程設計, 售后經理, 售前經理, 技術支持, ERP實施, 實施工程師, IT項目經理, IT項目助理, 信息咨詢, 數據挖掘, 數據運營, 網絡營銷, 物流與供應鏈, 渠道管理, 電商運營, 客戶關系管理, 新媒體運營, 產品運營). For every posting in each category, the following fields are collected: job title, company name, company size, work location, salary, job requirements, and benefits.
Data crawling code:
import time

import pymysql
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ChromeOptions


# Fetch the raw HTML of a url with requests (kept as a fallback; the main
# routine below renders the pages with Selenium instead).
def spider(url):
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"}
    try:
        rep = requests.get(url, headers=headers)
        rep.raise_for_status()
        rep.encoding = rep.apparent_encoding
        return rep.text
    except Exception:
        print("Request failed")


# Parse one result page: the job records sit in an inline <script> block,
# in the object assigned to "engine_jds".
def jiexi(html, info, name):
    soup = BeautifulSoup(html, "lxml")
    text = soup.find_all("script", type="text/javascript")[3].string
    data = eval(str(text).split("=", 1)[1])["engine_jds"]
    for d in data:
        try:
            job_name = d["job_name"].replace("\\", "")  # job title
        except Exception:
            job_name = " "
        try:
            company_name = d["company_name"].replace("\\", "")  # company name
        except Exception:
            company_name = " "
        try:
            providesalary_text = d["providesalary_text"].replace("\\", "")  # salary
        except Exception:
            providesalary_text = " "
        try:
            workarea_text = d["workarea_text"].replace("\\", "")  # work location
        except Exception:
            workarea_text = " "
        try:
            updatedate = d["updatedate"].replace("\\", "")  # update date
        except Exception:
            updatedate = " "
        try:
            jobwelf = d["jobwelf"].replace("\\", "")  # benefits
        except Exception:
            jobwelf = " "
        try:
            companyind_text = d["companyind_text"].replace("\\", "")  # company type
        except Exception:
            companyind_text = " "
        try:
            companysize_text = d["companysize_text"].replace("\\", "")  # company size
        except Exception:
            companysize_text = " "
        try:
            attribute_text = ",".join(d["attribute_text"])  # job requirements, joined with commas
        except Exception:
            attribute_text = " "
        # Append one record per posting, together with the search keyword `name`
        info.append([name, job_name, updatedate, company_name, companyind_text, companysize_text,
                     workarea_text, providesalary_text, attribute_text, jobwelf])


# Write the collected records into the `jobs` table of the MySQL database
def save(info):
    for data in info:
        present_job = data[0]          # keyword currently being crawled
        job_name = data[1]             # job title
        updatedate = data[2]           # update date
        company_name = data[3]         # company name
        companyind_text = data[4]      # company type
        companysize_text = data[5]     # company size
        workarea_text = data[6]        # work location
        providesalary_text = data[7]   # salary
        attribute_text = data[8]       # job requirements
        jobwelf = data[9]              # benefits
        # Build and execute the INSERT statement
        sql = "insert into jobs(當前爬取崗位,崗位,更新時間,公司名稱,公司類型,公司規模,工作地點,薪資,工作要求,工作待遇) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        cursor.execute(sql, [present_job, job_name, updatedate, company_name, companyind_text, companysize_text,
                             workarea_text, providesalary_text, attribute_text, jobwelf])
        db.commit()  # commit after each row


if __name__ == '__main__':
    # The 30 search keywords and, for each keyword, the number of result pages to crawl
    job = ["產品經理", "產品助理", "交互設計", "前端開發", "軟件設計", "IOS開發", "業務分析", "安卓開發", "PHP開發",
           "業務咨詢", "需求分析", "流程設計", "售后經理", "售前經理", "技術支持", "ERP實施", "實施工程師",
           "IT項目經理", "IT項目助理", "信息咨詢", "數據挖掘", "數據運營", "數據分析", "網絡營銷",
           "物流與供應鏈", "渠道管理", "電商運營", "客戶關系管理", "新媒體運營", "產品運營"]
    page_list = ['1141', '62', '169', '619', '356', '61', '229', '64', '56', '356', '1379', '147', '62', '29',
                 '2000', '173', '184', '10', '2', '396', '221', '115', '2000', '381', '5', '295', '1233',
                 '280', '699', '352']

    # Configure Chrome so the automated browser looks like a normal visitor
    option = ChromeOptions()
    UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
    option.add_argument(f'user-agent={UA}')
    option.add_experimental_option('useAutomationExtension', False)
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    web = webdriver.Chrome(options=option)
    # Hide the navigator.webdriver flag so the site does not detect Selenium
    web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
        """
    })
    web.implicitly_wait(3)

    # Open one search page first so the session cookies are set before crawling
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%E4%BA%A7%E5%93%81%E7%BB%8F%E7%90%86,2,2.html?'
    web.get(url)
    time.sleep(6)

    le = len(job)
    # Connect to the local MySQL database
    db = pymysql.connect(
        host="127.0.0.1",        # MySQL server
        user="root",             # user name
        password="root",         # password
        database="python上機",   # database name
        charset="utf8"
    )
    cursor = db.cursor()

    for j in range(23, le):  # start from keyword index 23 (resume point); use 0 to crawl every keyword
        for i in range(1, int(page_list[j])):  # result pages for this keyword
            info = []
            url = "https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html?".format(job[j], i)
            web.get(url)
            ht = web.page_source
            jiexi(ht, info, job[j])
            print('Keyword {}: page {}/{}'.format(j, i, page_list[j]))
            time.sleep(1)
            save(info)
            time.sleep(1)

    cursor.close()
    # Close the connection
    db.close()
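The INSERT in save() assumes a jobs table already exists in the python上機 database. The article does not show how that table was created; the following is a minimal one-off setup sketch whose column names mirror the INSERT statement, while the VARCHAR lengths are assumptions.

import pymysql

# Assumed setup script: create the `jobs` table that save() writes into.
# Column names mirror the INSERT statement above; the VARCHAR lengths are guesses.
db = pymysql.connect(host="127.0.0.1", user="root", password="root",
                     database="python上機", charset="utf8")
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobs (
        當前爬取崗位 VARCHAR(50),
        崗位 VARCHAR(200),
        更新時間 VARCHAR(20),
        公司名稱 VARCHAR(200),
        公司類型 VARCHAR(100),
        公司規模 VARCHAR(100),
        工作地點 VARCHAR(100),
        薪資 VARCHAR(50),
        工作要求 VARCHAR(500),
        工作待遇 VARCHAR(500)
    ) DEFAULT CHARSET=utf8
""")
db.commit()
cursor.close()
db.close()

Every column is kept as VARCHAR because jiexi() produces plain strings for all fields.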
Data crawling results:

2. Data Cleaning
2.1 Matching Job Titles
Because the crawl works page by page, the last results page for each keyword can contain postings that do not match the searched keyword. To ensure the accuracy of the collected data, a job-title matching step is applied during cleaning to keep only postings that match their keyword.
Code:
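The original listing for this step is not included here, so the following is only a minimal sketch of how the matching could be done, assuming the records are read back from the jobs table into a pandas DataFrame and that a posting is kept only when its title contains the keyword it was crawled under. Both the use of pandas and the exact matching rule are assumptions, not the author's code.

import pandas as pd
import pymysql

# Minimal sketch of the job-title matching step (assumed implementation).
# Read every scraped record back from MySQL into a DataFrame.
db = pymysql.connect(host="127.0.0.1", user="root", password="root",
                     database="python上機", charset="utf8")
df = pd.read_sql("SELECT * FROM jobs", db)
db.close()

# Keep a row only when the job title (崗位) contains the search keyword
# it was crawled under (當前爬取崗位); everything else is dropped.
mask = df.apply(lambda row: str(row["當前爬取崗位"]) in str(row["崗位"]), axis=1)
cleaned = df[mask]
print("{} rows before matching, {} rows after".format(len(df), len(cleaned)))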