这不是最近《三大队》比较火,还出了电视剧嘛,于是写了一个能够爬取电影天堂上视频的脚本,分为两部分:第一部分是 get_m3u8.py,用来拿到每一集的 m3u8 文件和 enckey 文件;第二部分是 download_video.py,用来下载所有的分片视频,处理 m3u8 和 enckey 文件,最后合并 ts 视频分片。
# File 1: get_m3u8.py
# coding: utf8
# author:shidt
"""Download the m3u8 playlist and enc.key file of every episode.

The script saves the series index page, extracts the per-episode page URLs
from it, and for each episode page derives the playlist/key URLs and stores
both files locally.  ``headers`` and ``proxy`` are module-level values that
the companion download script imports.
"""
import json
import os
import random
import time

import requests
from lxml import etree

# Root of the site hosting the episode pages; episode hrefs are appended to it.
SITE_ROOT = 'https://www.sxcse.com'
# Root under which '<key>/index.m3u8' and '<key>/enc.key' are requested.
PLAY_ROOT = 'https://1080p.jszyplay.com/play/'
# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 30

EPISODE_LINK_XPATH = '//*[@id="playlist"]/div[2]/div[2]/div/div/ul/li/a/@href'
PLAYER_SCRIPT_XPATH = '/html/body/div[2]/div[1]/div/div/div[1]/div/script/text()'


def get_user_agent() -> list:
    """Return the pool of User-Agent strings to pick from."""
    user_agents = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv: Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv: Gecko Fedora/ Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv: Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    ]
    return user_agents


def get_proxy() -> list:
    """Return the pool of HTTP proxy addresses (currently empty placeholders)."""
    proxy = [
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
        '',
    ]
    return proxy


# Picked once at import time; the download script imports both names.
headers = {
    'user-agent': random.choice(get_user_agent()),
}
proxy = {
    'http': random.choice(get_proxy()),
}


def get_index_source(url: str) -> str:
    """Fetch the series index page and save it as ``index.html``.

    :param url: address of the series index page.
    :return: name of the file the page source was written to.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    sess = requests.session()
    response = sess.get(url, headers=headers, proxies=proxy,
                        timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of saving an error page as the index.
    response.raise_for_status()
    response.encoding = 'utf-8'
    file_name = 'index.html'
    with open(file_name, mode='w', encoding='utf-8') as f:
        f.write(response.text)
    return file_name


def get_urls(source_file: str) -> list:
    """Extract the absolute URL of every episode page from a saved index page.

    :param source_file: path of the saved index HTML file.
    :return: episode page URLs, in the reverse of their order in the page.
    """
    with open(source_file, 'r', encoding='utf-8') as f:
        tree = etree.HTML(f.read())
    # Relative hrefs of every episode in the playlist block.
    episode_hrefs = tree.xpath(EPISODE_LINK_XPATH)
    return [SITE_ROOT + href for href in reversed(episode_hrefs)]


def _extract_play_key(script_text: str) -> str:
    """Pull the play key out of the inline player script of an episode page.

    The text after the first ' = ' and before the first ';' is parsed as JSON
    once backslashes are stripped; the key is the fifth '%2F'-separated
    component of its 'url' field.
    NOTE(review): this mirrors the page layout the script was written
    against; it breaks if the site changes that script.
    """
    payload = str(script_text).split(' = ')[1].replace('\\', '').split(';')[0]
    player_data = json.loads(payload)
    return player_data.get('url').split('%2F')[4]


def _save_remote_file(sess, url: str, target_file: str) -> None:
    """Download ``url`` through ``sess`` and write the raw bytes to disk."""
    resp = sess.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Do not store an HTTP error body as if it were a playlist or a key.
    resp.raise_for_status()
    with open(target_file, 'wb') as f:
        f.write(resp.content)


def download_m3u8_file(url_list, m3u8_path='m3u8_files', enckey_path='enckey_files'):
    """Download the m3u8 playlist and enc.key file of every episode.

    Episodes are numbered from 1 in the order of ``url_list``; a random 5-10
    second pause follows each episode.

    :param url_list: episode page URLs.
    :param m3u8_path: directory receiving the ``第N集.m3u8`` files.
    :param enckey_path: directory receiving the ``第N集.key`` files.
    :return: None
    :raises IndexError: if an episode page has no player script at the
        expected location.
    :raises requests.HTTPError: if any request answers with an error status.
    """
    os.makedirs(m3u8_path, exist_ok=True)
    os.makedirs(enckey_path, exist_ok=True)

    # One session is enough for the whole run.
    sess = requests.session()
    for episode_no, url in enumerate(url_list, start=1):
        print(f'正在下载第{episode_no}集的m3u8文件和enckey文件...')
        response = sess.get(url, headers=headers, proxies=proxy,
                            timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        response.encoding = 'UTF-8'

        # 1. Locate the inline script that carries the player configuration.
        scripts = etree.HTML(response.text).xpath(PLAYER_SCRIPT_XPATH)
        if not scripts:
            raise IndexError(f'player script not found on {url}')

        # 2. Derive the playlist and key URLs from the play key.
        key = _extract_play_key(scripts[0])
        m3u8_url = PLAY_ROOT + key + '/index.m3u8'
        enckey_url = PLAY_ROOT + key + '/enc.key'

        # 3. Store both files under the episode number.
        _save_remote_file(
            sess, m3u8_url, os.path.join(m3u8_path, f'第{episode_no}集.m3u8'))
        print(f'第{episode_no}集的m3u8文件下载完成!')

        _save_remote_file(
            sess, enckey_url, os.path.join(enckey_path, f'第{episode_no}集.key'))
        print(f'第{episode_no}集的enckey文件下载完成!')

        # Pause between episodes to avoid hammering the site.
        time.sleep(random.randint(5, 10))


if __name__ == '__main__':
    url = 'https://www.sxcse.com/mov/sdd2023.html'
    index_file = get_index_source(url)
    urls = get_urls(index_file)
    download_m3u8_file(urls)
# File 2: download_video.py
# coding: utf8
# author:shidt
"""Download every ts segment, localise the playlists and merge with ffmpeg.

Reads the ``*.m3u8`` / ``*.key`` files produced by ``get_m3u8.py``, downloads
the segments of each episode into ``<name>/<episode>/``, writes a playlist
and key file next to them, and finally runs ffmpeg in each episode folder.
"""
import os
import random
import re
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor, wait

import requests

from get_m3u8 import headers
from get_m3u8 import proxy

# Seconds to wait on a single segment request before giving up.
REQUEST_TIMEOUT = 60


def _is_segment_line(line: str) -> bool:
    """Tell whether a playlist line names a segment (not a tag, not blank)."""
    stripped = line.strip()
    return bool(stripped) and not stripped.startswith('#')


def download_one_video(url, i, path):
    """Download one ts segment to ``<path>/<i>.ts``.

    :param url: address of the segment.
    :param i: index of the segment, used as the file name.
    :param path: directory the segment is written to.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    print(url, i, '开始下载')
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Never save an HTTP error body as a video segment.
    resp.raise_for_status()
    with open(os.path.join(path, f'{i}.ts'), 'wb') as f:
        f.write(resp.content)
    print(url, i, '下载完成')


def download_one_episode(m3u8_file, target_path):
    """Download all ts segments listed in one episode's m3u8 file.

    Segments are numbered from 0 in playlist order; the same numbering is
    used when the local playlist is written by ``do_m3u8_url``.

    :param m3u8_file: path of the episode's m3u8 playlist.
    :param target_path: directory receiving the ``<index>.ts`` files.
    :return: None
    """
    os.makedirs(target_path, exist_ok=True)

    with open(m3u8_file, mode='r', encoding='utf-8') as f:
        segment_urls = [line.strip() for line in f if _is_segment_line(line)]

    # The context manager shuts the pool down once every task has finished.
    with ThreadPoolExecutor(50) as pool:
        tasks = [
            pool.submit(download_one_video, ts_url, i, target_path)
            for i, ts_url in enumerate(segment_urls)
        ]
        wait(tasks)

    # Exceptions raised in workers stay inside the futures; report them so a
    # missing segment does not go unnoticed.
    for i, (ts_url, task) in enumerate(zip(segment_urls, tasks)):
        error = task.exception()
        if error is not None:
            print(ts_url, i, f'下载失败: {error}')


def download_all_episodes(m3u8_files_path, name):
    """Download the segments of every episode that has an m3u8 file.

    :param m3u8_files_path: directory holding the per-episode m3u8 files.
    :param name: root directory of the series; one sub-folder per episode.
    :return: None
    :raises SystemExit: if ``m3u8_files_path`` does not exist.
    """
    if not os.path.exists(m3u8_files_path):
        print('m3u8文件路径不存在...')
        raise SystemExit(1)

    for file in os.listdir(m3u8_files_path):
        # The episode name is the file name up to its first dot.
        episode_name = file.split('.')[0]
        target_path = os.path.join(name, episode_name)
        print(f'正在下载{episode_name}')
        download_one_episode(os.path.join(m3u8_files_path, file), target_path)
        print(f'已完成下载{episode_name}')
        # Pause between episodes.
        time.sleep(random.randint(3, 6))


def do_m3u8_url(name, m3u8_path='m3u8_files', enckey_path='enckey_files'):
    """Write a local playlist and key file into every episode folder.

    For each m3u8 file the key URI ``enc.key`` is rewritten to ``key.m3u8``
    and every segment line is replaced by ``<name>/<episode>/<index>.ts``;
    the result is saved in the episode folder under the original file name.
    Each enckey file is copied to ``key.m3u8`` in its episode folder.

    :param name: root directory of the series.
    :param m3u8_path: directory holding the downloaded m3u8 files.
    :param enckey_path: directory holding the downloaded key files.
    :return: None
    """
    # Playlists: point the key and the segments at the local files.
    for file in os.listdir(m3u8_path):
        with open(os.path.join(m3u8_path, file), mode='r', encoding='utf8') as f:
            data = f.readlines()

        dirname = file.split('.')[0]
        episode_dir = os.path.join(name, dirname)
        os.makedirs(episode_dir, exist_ok=True)
        with open(os.path.join(episode_dir, file), 'w', encoding='utf8') as fw:
            i = 0
            for line in data:
                if _is_segment_line(line):
                    fw.write(f'{name}/{dirname}/{i}.ts\n')
                    i += 1
                    continue
                if line.startswith('#EXT-X-KEY'):
                    line = line.replace('URI="enc.key"', 'URI="key.m3u8"')
                fw.write(line)

    # Keys: the file was saved as raw bytes, so it must be copied as bytes;
    # decoding it as text can fail or alter the key.
    for file in os.listdir(enckey_path):
        dirname = file.split('.')[0]
        episode_dir = os.path.join(name, dirname)
        os.makedirs(episode_dir, exist_ok=True)
        with open(os.path.join(enckey_path, file), mode='rb') as f:
            key_bytes = f.read()
        with open(os.path.join(episode_dir, 'key.m3u8'), 'wb') as fw:
            fw.write(key_bytes)


def merge(filePath):
    """Merge the ts segments of every episode folder into an mp4 via ffmpeg.

    Runs ``ffmpeg -i <episode>.m3u8 -c copy <episode>.mp4`` with each episode
    folder as the working directory.

    :param filePath: root directory of the series.
    :return: None
    """
    for file in os.listdir(filePath):
        episode_dir = os.path.join(filePath, file)
        if not os.path.isdir(episode_dir):
            # Stray files in the root are not episodes.
            continue
        cmd = ['ffmpeg', '-i', f'{file}.m3u8', '-c', 'copy', f'{file}.mp4']
        try:
            # cwd= keeps the process working directory untouched, so relative
            # roots keep working on later iterations; the argument list avoids
            # shell quoting problems with episode names.
            subprocess.run(cmd, cwd=episode_dir, check=False)
        except FileNotFoundError:
            print('ffmpeg 未找到, 无法合并视频')
            return


if __name__ == '__main__':
    name = r'F:\三大队'
    # Create the series root if it does not exist yet.
    os.makedirs(name, exist_ok=True)
    m3u8_files_path = 'm3u8_files'
    # 1. Download all segments.
    download_all_episodes(m3u8_files_path, name)
    # 2. Localise the playlists and keys.
    do_m3u8_url(name)
    # 3. Merge the segments.
    merge(name)