本帖最后由 blindcat 于 2023-10-29 10:36 编辑
小红书图片视频解析源码
import json
import os
import re
from io import BytesIO
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from PIL import Image


def _extract_first_note(html):
    """Return the first note dict found in the page's initial state, or None."""
    soup = BeautifulSoup(html, 'html.parser')
    script_tag = soup.find('script', string=re.compile(r'window\.__INITIAL_STATE__'))
    if script_tag is None:
        return None

    # The embedded state is a JavaScript object literal, not strict JSON:
    # drop the assignment prefix and map `undefined` to `null` so that
    # json.loads accepts it.
    # NOTE(review): the blanket replace also rewrites the word "undefined"
    # inside string values; kept as-is to preserve the original behavior.
    state_text = (
        script_tag.text
        .replace('window.__INITIAL_STATE__=', '')
        .replace('undefined', 'null')
    )
    try:
        state = json.loads(state_text, strict=False)
    except json.JSONDecodeError:
        return None
    if not isinstance(state, dict):
        return None

    note_detail_map = (state.get('note') or {}).get('noteDetailMap') or {}
    first_entry = next(iter(note_detail_map.values()), None)
    if not first_entry:
        return None
    return first_entry.get('note')


def _last_h264_url(note):
    """Return the masterUrl of the last h264 stream entry of a note, or None."""
    media = (note.get('video') or {}).get('media') or {}
    h264_streams = (media.get('stream') or {}).get('h264') or []
    if not h264_streams:
        return None
    return h264_streams[-1].get('masterUrl')


def _jpg_name_from_url(url):
    """Build a filesystem-safe ``.jpg`` file name from the last URL path segment."""
    stem = os.path.basename(urlparse(url).path)
    stem = re.sub(r'[^\w.-]', '_', stem) or "image"
    if stem.lower().endswith(('.jpg', '.jpeg')):
        return stem
    return f"{stem}.jpg"


def xsh_video_image(url="http://xhslink.com/PMbpHv", cookies=None, save_dir="c:/", timeout=10):
    """Resolve a Xiaohongshu share link and print its video URL or save its images.

    The page is fetched, the ``window.__INITIAL_STATE__`` script is parsed,
    and the first entry of ``note.noteDetailMap`` is inspected. For a note of
    type ``video`` the ``masterUrl`` of the last h264 stream is printed;
    otherwise every image variant whose ``imageScene`` is ``CRD_WM_JPG`` is
    downloaded and stored as ``image_<n>.jpg`` inside ``save_dir``.

    :param url: share link of the note to resolve
    :param cookies: cookie dict sent with the request; when None a template
        with empty values is used, which you should fill in yourself
    :param save_dir: directory that receives the converted images
    :param timeout: timeout in seconds for the page request and each image download
    :return: None on success or when the page cannot be parsed (a message is
        printed); the string "解析错误" when an HTTP request fails
    """
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36",
    }
    if cookies is None:
        # Replace these values with your own cookies.
        cookies = {
            'customerBeakerSessionId': '',
            'xsecappid': '',
            'x-user-id-creator.xiaohongshu.com': '',
            'websectiga': '',
            'galaxy_creator_session_id': '',
            'webBuild': '3.11.3',
            'galaxy.creator.beaker.session.id': '',
            'sec_poison_id': '',
            'web_session': '',
            'gid': '',
            'a1': '',
            'abRequestId': '',
            'access-token-creator.xiaohongshu.com': '',
            'cache_feeds': '[]',
            'customer-sso-sid': '',
            'customerClientId': '',
            'unread': '',
            'webId': '',
        }

    try:
        # Send the GET request with the cookies attached.
        response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
        response.raise_for_status()
        if response.text == '':
            print("解析失败")
            return

        first_note = _extract_first_note(response.text)
        if first_note is None:
            print("解析失败")
            return

        # Original author's note: if the resolved video still carries a
        # watermark, the cookies are wrong; the Cookie-Editor browser
        # extension is recommended for exporting them.
        if first_note.get('type') == 'video':
            print("类型为video")
            video_url = _last_h264_url(first_note)
            if video_url is None:
                print("解析失败")
                return
            # This is the video address.
            print(video_url)
        else:
            print("类型是图片")
            os.makedirs(save_dir, exist_ok=True)
            saved_count = 0
            for image_obj in first_note.get('imageList') or []:
                for image_info in image_obj.get('infoList') or []:
                    if image_info.get('imageScene') != 'CRD_WM_JPG':
                        continue
                    image_url = image_info.get('url')
                    if not image_url:
                        continue
                    saved_count += 1
                    # Per the original author the image behind this URL is
                    # WebP, so it is converted while saving. Each image gets
                    # its own file so later ones do not overwrite earlier ones.
                    target = os.path.join(save_dir, f"image_{saved_count}.jpg")
                    download_and_convert_webp_to_jpg(image_url, target, timeout=timeout)
    except requests.exceptions.RequestException as err:
        # Covers HTTP status errors as well as connection failures and timeouts.
        print(err)
        return "解析错误"


def download_and_convert_webp_to_jpg(url, save_path, timeout=10):
    """Download an image and store it as a JPEG file.

    Original author's note: adapt this download routine to your own needs;
    it was written to be exposed as an API.

    :param url: network address of the image
    :param save_path: where to save the image; either a file path or an
        existing directory, in which case the file name is derived from the URL
    :param timeout: timeout in seconds for the download request
    :return: None
    """
    if os.path.isdir(save_path):
        # A directory cannot be written to directly; pick a file name inside it.
        save_path = os.path.join(save_path, _jpg_name_from_url(url))

    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        # Build a PIL image from the response body, convert it to RGB
        # (JPEG has no alpha channel) and write it out as JPEG.
        with Image.open(BytesIO(response.content)) as webp_image:
            webp_image.convert("RGB").save(save_path, "JPEG")
        print(f"成功将WebP图片保存为JPEG格式:{save_path}")
    else:
        print(f"无法下载图片,状态码:{response.status_code}")
|