一. 內容簡介
python下載bilibili視頻合集
二. 軟件環境
2.1vsCode
2.2Anaconda
version: conda 22.9.0
2.3代碼
鏈接:https://pan.baidu.com/s/1WuXTso_iltLlnrLffi1kYQ?pwd=1234
三.主要流程
3.1 下載單個視頻
感覺現在下載的清晰度不夠,可以再找找更高清晰度的獲取方式,把這塊代碼替換掉就行
代碼
import requests
import os
from lxml import etree
import re
def videoDownload1(url_):
    """Download a single bilibili video and mux it into ``./video/<title>/<title>.mp4``.

    The video page is fetched, the title and the first video/audio stream URLs
    are read from the embedded ``window.__playinfo__`` script, both streams are
    saved to temporary files in the working directory, and ffmpeg merges them.
    If the output file already exists nothing is downloaded.

    Args:
        url_: URL of the video page.

    Raises:
        IndexError: if the title or a stream URL cannot be found in the page.
        requests.HTTPError: if a media stream request returns an error status.
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    import subprocess  # local import: not part of the file's import block

    # Browser-like headers for the page request.
    # NOTE(review): hard-coded personal login cookie -- consider loading it from configuration.
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"
    }
    response_ = requests.get(url_, headers=headers_)
    html_obj = etree.HTML(response_.text)

    # The page <title> is "<video title>_<site name>...". The site spells its
    # name in simplified characters; the traditional spelling from the original
    # pattern is kept as a fallback.
    res_ = html_obj.xpath('//title/text()')[0]
    title_ = re.findall(r'(.*?)_(?:哔哩哔哩|嗶哩嗶哩)', res_)[0]
    # Remove the characters the original stripped ('/', ' ', '&', ':') plus the
    # remaining characters Windows forbids in file names.
    title_ = title_.translate(str.maketrans('', '', '/ &:\\*?"<>|'))

    # The playinfo script embeds the stream URLs; take the first video and audio entry.
    url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]
    video_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]
    audio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]

    folder_path = f"./video/{title_}"
    output_path = f"{folder_path}/{title_}.mp4"
    if os.path.exists(output_path):
        # Already merged earlier; avoids ffmpeg's interactive overwrite prompt.
        print(f'{title_} 下載完成')
        return
    # makedirs also creates ./video itself and tolerates an existing folder.
    os.makedirs(folder_path, exist_ok=True)

    # Headers for the media requests: the page URL is sent as Referer.
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Referer': url_
    }
    title_1 = title_ + '!'  # distinct name for the temporary stream files
    video_tmp = f'{title_1}.mp4'
    audio_tmp = f'{title_1}.mp3'
    ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"
    try:
        for media_url, tmp_path in ((video_url, video_tmp), (audio_url, audio_tmp)):
            # Stream to disk in chunks instead of holding the whole file in memory.
            with requests.get(media_url, headers=headers_, stream=True) as media:
                media.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in media.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
        # Argument list (no shell): special characters in the title cannot
        # break the command. check=True surfaces ffmpeg failures.
        subprocess.run(
            [ffmpeg_path, '-i', audio_tmp, '-i', video_tmp, '-c', 'copy',
             output_path, '-loglevel', 'quiet'],
            check=True,
        )
        print(f'{title_} 下載完成')
    finally:
        # Always remove the temporary stream files, even after an error.
        for tmp_path in (video_tmp, audio_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
3.2 下載選集視頻
選集視頻的播放鏈接很好找:每一集的鏈接就是在視頻地址后面加上 ?p=集數,拼接一下就可以拿到所有分集的播放鏈接了
代碼
import requests
import os
from lxml import etree
import re
# 獲取網頁源碼
def getUrls2(url):
    """Return the play URL of every part of a multi-part video page.

    The page is fetched and the ``<li>`` entries of the ``list-box`` part list
    are counted; part N is then addressed as ``<url without query>?p=N``.

    Args:
        url: URL of the video page; any existing query string is dropped.

    Returns:
        A list of part URLs, ``?p=1`` through ``?p=<number of parts>``. The
        list is empty when the page contains no ``list-box`` entries.
    """
    # Browser-like headers for the page request.
    # NOTE(review): hard-coded personal login cookie -- consider loading it from configuration.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"
    }
    response_ = requests.get(url, headers=headers)
    html_obj = etree.HTML(response_.text)

    # One <li> per part. The expression must start with "//" (descendant
    # search); the previous "http://ul[...]" text was not a valid XPath.
    lis = html_obj.xpath("//ul[@class='list-box']/li")

    # Keep only the part of the URL before the first '?'.
    cleaned_url = url.partition('?')[0]
    return [cleaned_url + "?p=" + str(part) for part in range(1, len(lis) + 1)]
import requests
import os
from lxml import etree
import re
def videoDownload3(url_,i,name):
    """Download one video of a collection into ``./video/<name>/<i>.<title>.mp4``.

    The video page is fetched, the title and the first video/audio stream URLs
    are read from the embedded ``window.__playinfo__`` script, both streams are
    saved to temporary files in the working directory, and ffmpeg merges them.
    If the output file already exists nothing is downloaded.

    Args:
        url_: URL of the video page.
        i: position of the video in the collection, used as file-name prefix.
        name: collection name, used as the output folder name.

    Raises:
        IndexError: if the title or a stream URL cannot be found in the page.
        requests.HTTPError: if a media stream request returns an error status.
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    import subprocess  # local import: not part of the file's import block

    # Browser-like headers for the page request.
    # NOTE(review): hard-coded personal login cookie -- consider loading it from configuration.
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"
    }
    response_ = requests.get(url_, headers=headers_)
    html_obj = etree.HTML(response_.text)

    # The page <title> is "<video title>_<site name>...". The site spells its
    # name in simplified characters; the traditional spelling from the original
    # pattern is kept as a fallback.
    res_ = html_obj.xpath('//title/text()')[0]
    title_ = re.findall(r'(.*?)_(?:哔哩哔哩|嗶哩嗶哩)', res_)[0]
    # Remove the characters the original stripped ('/', ' ', '&', ':', '-',
    # '—') plus the remaining characters Windows forbids in file names.
    title_ = title_.translate(str.maketrans('', '', '/ &:-—\\*?"<>|'))
    # The folder name only loses characters that cannot appear in a path component.
    fileName = name.translate(str.maketrans('', '', '\\/:*?"<>|'))

    # The playinfo script embeds the stream URLs; take the first video and audio entry.
    url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]
    video_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]
    audio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]

    folder_path = f"./video/{fileName}"
    # The merged file and the existence check now use the same name; the
    # original wrote "<i>.<title>!.mp4" but checked for "<i>.<title>.mp4".
    file_path = f"./video/{fileName}/{i}.{title_}.mp4"
    if os.path.exists(file_path):
        # Checked before downloading so an existing file costs no bandwidth.
        print(f'{i}.{title_} 下載完成')
        return
    # makedirs also creates ./video itself and tolerates an existing folder
    # (including one created concurrently by another worker thread).
    os.makedirs(folder_path, exist_ok=True)

    # Headers for the media requests: the page URL is sent as Referer.
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Referer': url_
    }
    title_1 = title_ + '!'  # distinct name for the temporary stream files
    video_tmp = f'{title_1}.mp4'
    audio_tmp = f'{title_1}.mp3'
    ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"
    try:
        for media_url, tmp_path in ((video_url, video_tmp), (audio_url, audio_tmp)):
            # Stream to disk in chunks instead of holding the whole file in memory.
            with requests.get(media_url, headers=headers_, stream=True) as media:
                media.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in media.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
        # Argument list (no shell): special characters in the title or folder
        # name cannot break the command. check=True surfaces ffmpeg failures.
        subprocess.run(
            [ffmpeg_path, '-i', audio_tmp, '-i', video_tmp, '-c', 'copy',
             file_path, '-loglevel', 'quiet'],
            check=True,
        )
        print(f'{i}.{title_} 下載完成')
    finally:
        # Always remove the temporary stream files, even after an error.
        for tmp_path in (video_tmp, audio_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
3.3 下載合集視頻
合集的里面數據的訪問api
合集里面的數據都在這個接口返回的json中:從中取出每個視頻的播放id(bvid),再拼接成視頻播放鏈接
代碼
# 獲取網頁源碼
def getUrls3(url):
    """Collect the play URLs of every video in a bilibili collection.

    The collection listing is read page by page from the
    ``seasons_archives_list`` JSON API (30 entries per request). The API
    requests are sent without custom headers, as before; the header dict the
    original built here was never passed to ``requests`` and has been dropped.

    Args:
        url: collection page URL; the first number found in it is used as the
            uploader id (``mid``) and the second as the collection id
            (``season_id``).

    Returns:
        ``(urls, name)``: the video page URLs in API order and the collection
        name from the API metadata.

    Raises:
        IndexError: if ``url`` contains fewer than two numbers.
        RuntimeError: if the first listing page does not return HTTP 200.
    """
    numbers = re.findall(r'\d+', url)
    mid = numbers[0]
    season_id = numbers[1]

    def fetch_page(page_num):
        # Return the "data" object of one listing page, or None on a non-200 reply.
        api_url = f"https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid={mid}&season_id={season_id}&sort_reverse=false&page_num={page_num}&page_size=30"
        response = requests.get(api_url)
        if response.status_code != 200:
            return None
        return response.json()["data"]

    first_page = fetch_page(1)
    if first_page is None:
        # Previously this surfaced later as an unbound-variable error.
        raise RuntimeError(f"could not fetch collection listing for mid={mid}, season_id={season_id}")

    total = int(first_page["page"]["total"])
    page_size = int(first_page["page"]["page_size"])
    name = first_page["meta"]["name"]
    # Ceiling division: no extra empty page when total is a multiple of page_size.
    page_count = -(-total // page_size)

    urls = []
    for page_num in range(1, page_count + 1):
        # Reuse the page that was already fetched for the metadata.
        data = first_page if page_num == 1 else fetch_page(page_num)
        if data is None:
            # Best effort, as before: a failed page is skipped, but no longer silently.
            print(f"Skipping collection page {page_num}: request failed")
            continue
        urls.extend(
            f"https://www.bilibili.com/video/{archive['bvid']}/"
            for archive in data["archives"]
        )
    return urls,name
import requests
import os
from lxml import etree
import re
def videoDownload2(url_,i):
    """Download one part of a multi-part video into ``./video/<page heading>/<i>.<title>.mp4``.

    The video page is fetched, the title and the first video/audio stream URLs
    are read from the embedded ``window.__playinfo__`` script, both streams are
    saved to temporary files in the working directory, and ffmpeg merges them.
    The output folder is named after the page's ``<h1 class="video-title">``
    text. If the output file already exists nothing is downloaded.

    Args:
        url_: URL of the part's page.
        i: part number, used as file-name prefix.

    Raises:
        IndexError: if the title, heading or a stream URL cannot be found.
        requests.HTTPError: if a media stream request returns an error status.
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    import subprocess  # local import: not part of the file's import block

    # Browser-like headers for the page request.
    # NOTE(review): hard-coded personal login cookie -- consider loading it from configuration.
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"
    }
    response_ = requests.get(url_, headers=headers_)
    html_obj = etree.HTML(response_.text)

    # The page <title> is "<video title>_<site name>...". The site spells its
    # name in simplified characters; the traditional spelling from the original
    # pattern is kept as a fallback.
    res_ = html_obj.xpath('//title/text()')[0]
    title_ = re.findall(r'(.*?)_(?:哔哩哔哩|嗶哩嗶哩)', res_)[0]
    # Remove the characters the original stripped ('/', ' ', '&', ':') plus the
    # remaining characters Windows forbids in file names.
    title_ = title_.translate(str.maketrans('', '', '/ &:\\*?"<>|'))
    # Folder name comes from the page heading; it only loses characters that
    # cannot appear in a path component.
    fileName = html_obj.xpath('//h1[@class="video-title"]/text()')[0]
    fileName = fileName.translate(str.maketrans('', '', '\\/:*?"<>|'))

    # The playinfo script embeds the stream URLs; take the first video and audio entry.
    url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]
    video_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]
    audio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]

    folder_path = f"./video/{fileName}"
    file_path = f"./video/{fileName}/{i}.{title_}.mp4"
    if os.path.exists(file_path):
        # Checked before downloading so an existing file costs no bandwidth.
        print(f'{i}.{title_} 下載完成')
        return
    # makedirs also creates ./video itself and tolerates an existing folder.
    os.makedirs(folder_path, exist_ok=True)

    # Headers for the media requests: the page URL is sent as Referer.
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Referer': url_
    }
    title_1 = title_ + '!'  # distinct name for the temporary stream files
    video_tmp = f'{title_1}.mp4'
    audio_tmp = f'{title_1}.mp3'
    ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"
    try:
        for media_url, tmp_path in ((video_url, video_tmp), (audio_url, audio_tmp)):
            # Stream to disk in chunks instead of holding the whole file in memory.
            with requests.get(media_url, headers=headers_, stream=True) as media:
                media.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in media.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
        # Argument list (no shell): special characters in the title or folder
        # name cannot break the command. check=True surfaces ffmpeg failures.
        subprocess.run(
            [ffmpeg_path, '-i', audio_tmp, '-i', video_tmp, '-c', 'copy',
             file_path, '-loglevel', 'quiet'],
            check=True,
        )
        print(f'{i}.{title_} 下載完成')
    finally:
        # Always remove the temporary stream files, even after an error.
        for tmp_path in (video_tmp, audio_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
3.4 多線程
代碼
import concurrent.futures
import requests
# 定義一個下載函數
def download_video(URL):
    """Unpack one ``"<url> <index> <name>"`` job string and download that video.

    Only the first two spaces separate fields, so the name may itself contain
    spaces. The index is passed on as a string.
    """
    page_url, position, folder_name = URL.split(" ", 2)
    videoDownload3(page_url, position, folder_name)
def THREAD(URLS, max_workers=10):
    """Run ``download_video`` for every job string in ``URLS`` on a thread pool.

    Args:
        URLS: iterable of job strings in the format ``download_video`` accepts.
        max_workers: number of worker threads; defaults to the previously
            hard-coded value of 10.

    An exception raised by one job is printed and does not stop the others.
    The function returns once every job has finished.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything first, then drain results in completion order.
        futures = [executor.submit(download_video, URL) for URL in URLS]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()  # only called to surface the job's exception
            except Exception as e:
                print(f"An error occurred: {e}")
3.5 結果
# Input format: "<page URL> <mode>".
# Mode 1 = single video, 2 = multi-part video, 3 = collection.
url_model = "https://space.bilibili.com/471303350/channel/collectiondetail?sid=1278346 3"
value = url_model.split(' ')
url = value[0]
model = value[1]
if model == "1":
    videoDownload1(url)
    print("下載完成")
elif model == "2":
    # URL analysis for multi-part videos.
    # Opening the video from search gives:
    #   https://www.bilibili.com/video/BV1qW4y1a7fU/?spm_id_from=333.337.search-card.all.click
    # Clicking a part in the part list gives:
    #   https://www.bilibili.com/video/BV1qW4y1a7fU?p=1
    #   https://www.bilibili.com/video/BV1qW4y1a7fU?p=2&vd_source=de2dcd0f37ff916ec3f8fb83c6366123
    # So part N is addressed as ...?p=N; getUrls2 counts the parts listed on
    # the page and builds those URLs.
    urls = getUrls2(url)
    # Number from 1 so the file prefix matches the ?p=N part number.
    for index, url in enumerate(urls, start=1):
        videoDownload2(url, index)
    print("下載完成")
elif model == "3":
    # URL analysis for collections.
    # The videos of a collection have unrelated URLs and the links on the
    # collection page are loaded dynamically, so the list is read from the
    # JSON API instead (a headless browser would be the alternative):
    #   https://space.bilibili.com/107762251/channel/collectiondetail?sid=877119
    #   https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid=107762251&season_id=877119&sort_reverse=false&page_num=1&page_size=30
    #   https://space.bilibili.com/389199842/channel/collectiondetail?sid=1275285
    #   https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid=389199842&season_id=1275285&sort_reverse=false&page_num=1&page_size=30
    # The first number is the uploader and the second the collection;
    # NOTE(review): the rest of the API URL appears to be fixed -- confirm.
    urls, name = getUrls3(url)
    # Number from 1, matching the position shown in the collection.
    for index, url in enumerate(urls, start=1):
        videoDownload3(url, index, name)
    # For a multithreaded run, build "<url> <index> <name>" job strings and
    # pass the list to THREAD instead of looping here.
這里僅做展示:有些合集下載的時候有點bug,還沒找到問題——可以下載,但是保存路徑有點問題,應該是合集名稱和命令行沖突了,我就不改了
3.6 合集視頻更新
原來會出現部分合集顯示下載成功,但是文件夾里面沒有東西,是因為有些合集名字在命令里面沒辦法執(zhí)行,因為一些特殊符號什么的,所以把合集名字手動指定一下下載就可以了,然后多線程加上去,代碼如下
拿視頻鏈接的
# 獲取網頁源碼
def getUrls3(url):
    """Collect the play URLs of every video in a bilibili collection.

    The collection listing is read page by page from the
    ``seasons_archives_list`` JSON API (30 entries per request). The API
    requests are sent without custom headers, as before; the header dict the
    original built here was never passed to ``requests`` and has been dropped.

    Args:
        url: collection page URL; the first number found in it is used as the
            uploader id (``mid``) and the second as the collection id
            (``season_id``).

    Returns:
        ``(urls, name)``: the video page URLs in API order and the collection
        name from the API metadata.

    Raises:
        IndexError: if ``url`` contains fewer than two numbers.
        RuntimeError: if the first listing page does not return HTTP 200.
    """
    numbers = re.findall(r'\d+', url)
    mid = numbers[0]
    season_id = numbers[1]

    def fetch_page(page_num):
        # Return the "data" object of one listing page, or None on a non-200 reply.
        api_url = f"https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid={mid}&season_id={season_id}&sort_reverse=false&page_num={page_num}&page_size=30"
        response = requests.get(api_url)
        if response.status_code != 200:
            return None
        return response.json()["data"]

    first_page = fetch_page(1)
    if first_page is None:
        # Previously this surfaced later as an unbound-variable error.
        raise RuntimeError(f"could not fetch collection listing for mid={mid}, season_id={season_id}")

    total = int(first_page["page"]["total"])
    page_size = int(first_page["page"]["page_size"])
    name = first_page["meta"]["name"]
    # Ceiling division: no extra empty page when total is a multiple of page_size.
    page_count = -(-total // page_size)

    urls = []
    for page_num in range(1, page_count + 1):
        # Reuse the page that was already fetched for the metadata.
        data = first_page if page_num == 1 else fetch_page(page_num)
        if data is None:
            # Best effort, as before: a failed page is skipped, but no longer silently.
            print(f"Skipping collection page {page_num}: request failed")
            continue
        urls.extend(
            f"https://www.bilibili.com/video/{archive['bvid']}/"
            for archive in data["archives"]
        )
    return urls,name
下載視頻的
import requests
import os
from lxml import etree
import re
def videoDownload3(url_,index,name):
    """Download one video of a collection into ``./video/<name>/<index>.<title>.mp4``.

    The video page is fetched, the title and the first video/audio stream URLs
    are read from the embedded ``window.__playinfo__`` script, both streams are
    saved to temporary files in the working directory, and ffmpeg merges them.
    If the output file already exists nothing is downloaded.

    Args:
        url_: URL of the video page.
        index: position of the video in the collection, used as file-name prefix.
        name: collection name, used as the output folder name.

    Raises:
        IndexError: if the title or a stream URL cannot be found in the page.
        requests.HTTPError: if a media stream request returns an error status.
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    import subprocess  # local import: not part of the file's import block

    # Browser-like headers for the page request.
    # NOTE(review): hard-coded personal login cookie -- consider loading it from configuration.
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'Cookie': "buvid3=7014DDC0-BF1E-B121-F5A5-F10753C840B423630infoc; i-wanna-go-back=-1; _uuid=49BF2138-1E10F-D5F5-10898-D8311651B53927883infoc; FEED_LIVE_VERSION=V8; DedeUserID=171300042; DedeUserID__ckMd5=c65bec3211413192; CURRENT_FNVAL=4048; rpdid=|(J|)J~m~llk0J'uYm|)~klRl; header_theme_version=CLOSE; hit-new-style-dyn=1; hit-dyn-v2=1; is-2022-channel=1; fingerprint=fe5c7462625770aa2abce449a7c01fd2; buvid_fp_plain=undefined; b_nut=1691207170; b_ut=5; buvid_fp=fe5c7462625770aa2abce449a7c01fd2; LIVE_BUVID=AUTO4016915564967297; buvid4=1AE73807-AEA0-7078-DA57-7F9FE5C3D6F896987-023080912-A0g5nInZwV3VmJJT68FJxw%3D%3D; home_feed_column=5; SESSDATA=fc1266d3%2C1708653865%2C29c08%2A81-i-T9HQrucvpCVcPwSwXl5LmjTyduIzF9veu0KS9i2IwXK_xkcqlt1XQyxJ3sG-9HMSwLwAAKgA; bili_jct=068bc0a79f3fa7aa1a030e478dbf6d4b; sid=5yvjlnfi; browser_resolution=1920-971; bili_ticket=eyJhbGciOiJFUzM4NCIsImtpZCI6ImVjMDIiLCJ0eXAiOiJKV1QifQ.eyJleHAiOjE2OTMzNjY1MTcsImlhdCI6MTY5MzEwNzMxNywicGx0IjotMX0.I1Yfp8S9UIkU4S0G5vtBJfslPtgY7QLCj1dx9WQpyRmxKpZoA1qB5UYXNW4KBSZFGljMm7F1lbGXSGco7F79JZJ2sZNBvH9QiSVlmipzAJKaucIoFh6s3m1jpqjLp10r; bili_ticket_expires=1693366517; bp_video_offset_171300042=834376858445283367; b_lsid=1021245DB_18A3567E5C2; CURRENT_QUALITY=80; PVID=2"
    }
    response_ = requests.get(url_, headers=headers_)
    html_obj = etree.HTML(response_.text)

    # The page <title> is "<video title>_<site name>...". The site spells its
    # name in simplified characters; the traditional spelling from the original
    # pattern is kept as a fallback.
    res_ = html_obj.xpath('//title/text()')[0]
    title_ = re.findall(r'(.*?)_(?:哔哩哔哩|嗶哩嗶哩)', res_)[0]
    # Remove the characters the original stripped ('/', ' ', '&', ':') plus the
    # remaining characters Windows forbids in file names.
    title_ = title_.translate(str.maketrans('', '', '/ &:\\*?"<>|'))
    # The folder name only loses characters that cannot appear in a path component.
    folder_name = name.translate(str.maketrans('', '', '\\/:*?"<>|'))

    # The playinfo script embeds the stream URLs; take the first video and audio entry.
    url_list_str = html_obj.xpath('//script[contains(text(),"window.__playinfo__")]/text()')[0]
    video_url = re.findall(r'"video":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]
    audio_url = re.findall(r'"audio":\[{"id":\d+,"baseUrl":"(.*?)"', url_list_str)[0]

    folder_path = f"./video/{folder_name}"
    output_path = f"./video/{folder_name}/{index}.{title_}.mp4"
    if os.path.exists(output_path):
        # Skip finished videos; this also keeps ffmpeg from reaching its
        # overwrite prompt inside a worker thread.
        print(f'{title_} 下載完成')
        return
    # makedirs also creates ./video itself and tolerates an existing folder
    # (including one created concurrently by another worker thread).
    os.makedirs(folder_path, exist_ok=True)

    # Headers for the media requests: the page URL is sent as Referer.
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Referer': url_
    }
    title_1 = title_ + '!'  # distinct name for the temporary stream files
    video_tmp = f'{title_1}.mp4'
    audio_tmp = f'{title_1}.mp3'
    ffmpeg_path = r".\ffmpeg\bin\ffmpeg.exe"
    try:
        for media_url, tmp_path in ((video_url, video_tmp), (audio_url, audio_tmp)):
            # Stream to disk in chunks instead of holding the whole file in memory.
            with requests.get(media_url, headers=headers_, stream=True) as media:
                media.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in media.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
        # Argument list (no shell): special characters in the title or folder
        # name cannot break the command. check=True surfaces ffmpeg failures.
        subprocess.run(
            [ffmpeg_path, '-i', audio_tmp, '-i', video_tmp, '-c', 'copy',
             output_path, '-loglevel', 'quiet'],
            check=True,
        )
        print(f'{title_} 下載完成')
    finally:
        # Always remove the temporary stream files, even after an error.
        for tmp_path in (video_tmp, audio_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
多線程
import concurrent.futures
import requests
# 定義一個下載函數
def download_video(URL):
    """Unpack one ``"<url> <index> <name>"`` job string and download that video.

    Only the first two spaces separate fields, so the name may itself contain
    spaces. The index is passed on as a string.
    """
    page_url, position, folder_name = URL.split(" ", 2)
    videoDownload3(page_url, position, folder_name)
def THREAD(URLS, max_workers=10):
    """Run ``download_video`` for every job string in ``URLS`` on a thread pool.

    Args:
        URLS: iterable of job strings in the format ``download_video`` accepts.
        max_workers: number of worker threads; defaults to the previously
            hard-coded value of 10.

    An exception raised by one job is printed and does not stop the others.
    The function returns once every job has finished.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit everything first, then drain results in completion order.
        futures = [executor.submit(download_video, URL) for URL in URLS]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()  # only called to surface the job's exception
            except Exception as e:
                print(f"An error occurred: {e}")
執(zhí)行
# Input format: "<page URL> <mode>".
# Mode 1 = single video, 2 = multi-part video, 3 = collection.
url_model = "https://space.bilibili.com/389199842/channel/collectiondetail?sid=1275285 3"
value = url_model.split(' ')
url = value[0]
model = value[1]
if model == "1":
    videoDownload1(url)
    print("下載完成")
elif model == "2":
    # URL analysis for multi-part videos.
    # Opening the video from search gives:
    #   https://www.bilibili.com/video/BV1qW4y1a7fU/?spm_id_from=333.337.search-card.all.click
    # Clicking a part in the part list gives:
    #   https://www.bilibili.com/video/BV1qW4y1a7fU?p=1
    #   https://www.bilibili.com/video/BV1qW4y1a7fU?p=2&vd_source=de2dcd0f37ff916ec3f8fb83c6366123
    # So part N is addressed as ...?p=N; getUrls2 counts the parts listed on
    # the page and builds those URLs.
    urls = getUrls2(url)
    # Number from 1 so the file prefix matches the ?p=N part number.
    for index, url in enumerate(urls, start=1):
        videoDownload2(url, index)
    print("下載完成")
elif model == "3":
    # URL analysis for collections.
    # The videos of a collection have unrelated URLs and the links on the
    # collection page are loaded dynamically, so the list is read from the
    # JSON API instead (a headless browser would be the alternative):
    #   https://space.bilibili.com/107762251/channel/collectiondetail?sid=877119
    #   https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid=107762251&season_id=877119&sort_reverse=false&page_num=1&page_size=30
    #   https://space.bilibili.com/389199842/channel/collectiondetail?sid=1275285
    #   https://api.bilibili.com/x/polymer/web-space/seasons_archives_list?mid=389199842&season_id=1275285&sort_reverse=false&page_num=1&page_size=30
    # The first number is the uploader and the second the collection;
    # NOTE(review): the rest of the API URL appears to be fixed -- confirm.
    urls, name = getUrls3(url)
    # The folder name is set by hand instead of using the API's collection name.
    name = "qml項目"
    # One "<url> <index> <name>" job string per video, numbered from 1.
    URLS = [f"{url} {index} {name}" for index, url in enumerate(urls, start=1)]
    THREAD(URLS)
    print("全部下載完成?。?!")
文章來源:http://www.zghlxwxcb.cn/news/detail-691193.html
四.參考
http://t.csdn.cn/6Pt7v 想下載B站視頻卻不知如何下手?一文教你爬B站!
到了這里,關于python爬取bilibili,下載視頻的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續(xù)瀏覽下面的相關文章,希望大家以后多多支持TOY模板網!