簡介
Instagram 是目前最熱門的社交媒體平臺之一,擁有大量優(yōu)質的視頻內容。但是要逐一下載這些視頻往往非常耗時。在這篇文章中,我們將介紹如何使用 Python 編寫一個腳本,來實現 Instagram 視頻的批量下載和信息爬取。
我們使用selenium獲取目標用戶的 HTML 源代碼,并將其保存在本地:
def get_html_source(html_url):
    """Open *html_url* in Edge, scroll until no new content loads, and
    return the fully rendered page source as a string."""
    options = webdriver.EdgeOptions()
    # Keep the browser window alive after the driver script finishes.
    options.add_experimental_option("detach", True)
    # options.add_argument("--headless")  # uncomment to run Edge without a visible window
    driver = webdriver.Edge(options=options)
    driver.get(html_url)
    time.sleep(3)  # give the initial page load time to settle
    # Simulate mouse-wheel scrolling: keep jumping to the bottom of the
    # document until its height stops growing (no more lazy-loaded posts).
    page_no = 1
    while True:
        previous_height = driver.execute_script("return document.body.scrollHeight")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)  # wait for lazy-loaded content to arrive
        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == previous_height:
            break
        logger.info(f"Scrolled to page{page_no}")
        page_no += 1
    page_source = driver.page_source
    driver.quit()
    return page_source
# Fetch the fully scrolled profile page and persist the raw HTML so the
# parsing steps below can be re-run without hitting the network again.
# Bug fix: the function name was split across two lines mid-identifier
# ("get_h" / "tml_source"), which is a syntax error as pasted.
total_html_source = get_html_source(f'https://imn/{username}/')
# Make sure the per-user download directory exists before writing into it.
os.makedirs(f'./downloads/{username}', exist_ok=True)
with open(f'./downloads/{username}/html_source.txt', 'w', encoding='utf-8') as file:
    file.write(total_html_source)
然后,我們遍歷每個帖子,提取相關信息并下載對應的圖片或視頻。注意不同類型的帖子,下載爬取的方式不一樣。
def downloader(logger, downlod_url, file_dir, file_name):
    """Stream-download *downlod_url* into ``file_dir/file_name``.

    Shows a tqdm progress bar sized from the Content-Length header.
    Returns the saved file path on success, or the string "err" when the
    server does not answer with HTTP 200.
    """
    logger.info(f"====>downloading:{file_name}")
    # Stream the response so large videos are never held in memory at once.
    response = requests.get(downlod_url, stream=True)
    if response.status_code == 200:
        # Bug fix: create the actual target directory. The old code only
        # ever created a top-level "downloads" folder, so writing into a
        # per-user subdirectory raised FileNotFoundError.
        os.makedirs(file_dir, exist_ok=True)
        # Content-Length may be absent; fall back to 0 so tqdm still works.
        total_size = int(response.headers.get('content-length', 0))
        file_path = os.path.join(file_dir, file_name)
        with open(file_path, "wb") as f, tqdm(total=total_size, unit='B', unit_scale=True, unit_divisor=1024, ncols=80, desc=file_name) as pbar:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
                    pbar.update(len(chunk))
        logger.info(f"downloaded and saved as {file_path}")
        return file_path
    else:
        logger.info("Failed to download .")
        return "err"
def image_set_downloader(logger, id, file_dir, file_name_prx):
    """Download every image of a multi-image post.

    Returns a ``(desc, downloaded_paths)`` tuple where *desc* is the post
    caption text ("" when none was found) and *downloaded_paths* is the
    list of values returned by :func:`downloader` for each image.
    """
    logger.info("downloading image set========")
    html_source = get_html_source("https://imm" + id)
    # Pull every image URL out of the rendered post page.
    download_pattern = r'data-proxy="" data-src="([^"]+)"'
    image_urls = re.findall(download_pattern, html_source)
    download_file = []
    for index, raw_url in enumerate(image_urls, start=1):
        cleaned_url = raw_url.replace("amp;", "")
        file_name = f"{file_name_prx}_{index}.jpg"
        download_file.append(downloader(logger, cleaned_url, file_dir, file_name))
    # The caption lives in a <div class="desc"> block; keep the last match.
    desc_pattern = r'<div class="desc">([^"]+)follow'
    desc = ""
    for caption in re.findall(desc_pattern, html_source):
        desc = caption
        logger.info(f"desc:{caption}")
    return desc, download_file
def image_or_video_downloader(logger, id, file_dir, file_name):
    """Download the single image or video of a post.

    Returns a ``(desc, downloaded_paths)`` tuple where *desc* is the post
    caption ("" when none was found) and *downloaded_paths* is the list of
    values returned by :func:`downloader`.
    """
    logger.info("downloading image or video========")
    image_set_url = "https://im" + id
    html_source = get_html_source(image_set_url)
    # Parse the direct media URL out of the post page.
    download_pattern = r'href="(https://scontent[^"]+)"'
    # Bug fix: search the page we just fetched (html_source). The old code
    # searched `part`, a module-level loop variable that is not defined in
    # this function's scope and holds a different page fragment anyway.
    matches = re.findall(download_pattern, html_source)
    download_file = []
    for i, match in enumerate(matches, start=1):
        downlod_url = match.replace("amp;", "")
        download_file.append(downloader(logger, downlod_url, file_dir, file_name))
    # The caption lives in a <div class="desc"> block; keep the last match.
    desc_pattern = r'<div class="desc">([^"]+)follow'
    desc_matches = re.findall(desc_pattern, html_source)
    desc = ""
    for match in desc_matches:
        desc = match
        logger.info(f"desc:{match}")
    return desc, download_file
# Split the profile HTML into one fragment per post. The first fragment
# (everything before the first item) and the last one are page chrome,
# not posts, so they are skipped inside the loop.
parts = total_html_source.split('class="item">')
posts_number = len(parts) - 2
logger.info(f"posts number:{posts_number} ")
for post_index, part in enumerate(parts):
    # Idiom fix: renamed `id` -> `post_id` so the builtin is not shadowed.
    post_id = ""
    post_type = ""
    post_time = ""
    # Fragment 0 is the page header and the final fragment the footer.
    if post_index == 0 or post_index == len(parts) - 1:
        continue
    logger.info(f"==================== post {post_index} =====================================")
    # Extract the post timestamp (last match wins).
    time_pattern = r'class="time">([^"]+)</div>'
    matches = re.findall(time_pattern, part)
    for match in matches:
        post_time = match
        logger.info(f"time:{match}")
    # Extract the post id (the relative link of the post; last match wins).
    id_pattern = r'<a href="([^"]+)">'
    id_matches = re.findall(id_pattern, part)
    for match in id_matches:
        post_id = match
        logger.info(f"id:{post_id}")
    # Dispatch on the post type, inferred from markers in the fragment.
    if '#ffffff' in part:
        post_type = "Image Set"
        logger.info("post_type: Image Set")
        image_name_pex = "img" + str(post_index)
        desc, post_contents = image_set_downloader(logger, post_id, image_dir, image_name_pex)
    elif "video" in part:
        post_type = "Video"
        logger.info("post_type: Video")
        video_name = "video" + str(post_index) + ".mp4"
        desc, post_contents = image_or_video_downloader(logger, post_id, video_dir, video_name)
    else:
        logger.info("post_type: Image")
        post_type = "Image"
        img_name = "img" + str(post_index) + ".jpg"
        desc, post_contents = image_or_video_downloader(logger, post_id, image_dir, img_name)
    # Record one spreadsheet row per post.
    exceller.write_row((post_index, post_time, post_type, desc, ', '.join(post_contents)))
最后,我們調用上述定義的函數,實現圖片/視頻的下載和 Excel 文件的寫入。
結果展示
文章來源:http://www.zghlxwxcb.cn/news/detail-855809.html
源碼
想要獲取源碼的小伙伴加v:15818739505 ,手把手教你部署使用哦文章來源地址http://www.zghlxwxcb.cn/news/detail-855809.html
到了這里,關于ins視頻批量下載,instagram批量爬取視頻信息【爬蟲實戰(zhàn)課1】的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續(xù)瀏覽下面的相關文章,希望大家以后多多支持TOY模板網!