From 1fdd8e393ed8e480d953f53063cc45b0f1f05268 Mon Sep 17 00:00:00 2001 From: lovewater <1943158197@qq.com> Date: Sun, 29 Mar 2026 01:34:09 +0800 Subject: [PATCH] change download fifter count --- .DS_Store | Bin 0 -> 6148 bytes Readme.md | 9 ++++++ spider.py | 82 ++++++++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 77 insertions(+), 14 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..e40a6521589c7398cccecb6d01061d80549f47d4 GIT binary patch literal 6148 zcmeHKO>fgM7=FDK>M((N*rZ*MEOD($yCI~CODNl6*9E}=P)SGx5ouhKbYZG0<;;Ez zSAGfqg%dpfnA9&gBEsx7Acs%+Sf9ccrZ zenv);LW-$8%Qm9jfK$LJ@UJPL)~<&YE2*ZOme#L)7iBVn7h41S0qp(MrW4r0i0X1` z8Q-F5mX=k&|DEmZwr<^S^ETh(Z-Ni94yv%4m7}nDrFze$%%bad7`;f7dfeH6Ci5yx z^Q6#(G)@rm`emBOvL4A<9+#RM7zg*bH|`wH=YzpfR~!$Ai>{atkNREl==kws;qik9 z51*WmKW39$e$nqv411NfyB4qE3XLt9T?9p*$^1RMis{igC3=^e0DCl{_$*sii=HBC zS*?3iAo3H$%J8JNm) z`R&kkb|V$A>wxdw$n0IxP64ODpA=C0gFsQ}TWk#Ktpk<50svbW)`qsaODK-8=v!!|5r(_=M-=X{8tLF)=6+OLQ3{*-AGQIwF2cWiZT_~7}OM0_BhrJbrc_>XoF9v X2GF str: match = re.search(r"BV[0-9A-Za-z]{10}", url) if not match: @@ -81,13 +91,14 @@ def get_bvid_from_url(url: str) -> str: def sanitize_title(title: str, max_length: int = 80) -> str: - cleaned = re.sub(r'[\\/:*?"<>|]+', "_", title).strip() - cleaned = re.sub(r"\s+", "_", cleaned) - cleaned = re.sub(r"_+", "_", cleaned) - cleaned = cleaned.strip("._") + # 只保留字母、数字和中文,其他全部移除 + cleaned = re.sub(r'[^\w\u4e00-\u9fff]', '', title) + # 移除连续下划线 + cleaned = re.sub(r'_+', '_', cleaned) + cleaned = cleaned.strip('_') if not cleaned: cleaned = "video" - return cleaned[:max_length].rstrip("._") + return cleaned[:max_length].rstrip('_') def build_output_file_name(bvid: str, title: str) -> str: @@ -217,15 +228,54 @@ async def download_video_file(bvid: str, output_file_name: str): return final_path +def download_with_youget(bvid: str, title: str, output_dir: Path) -> bool: + """使用 you-get 下载视频""" + url = f"https://www.bilibili.com/video/{bvid}" + # 生成自定义文件名:bvid_清理后的标题 + custom_filename = f"{bvid}_{sanitize_title(title)}" + try: + # 使用 you-get 下载视频,指定文件名,禁用字幕 + result = subprocess.run( + ["you-get", "-o", str(output_dir), "-O", custom_filename, "--no-caption", url], + capture_output=True, + text=True, + check=True + ) + print(f"you-get output: {result.stdout}") + return True + except subprocess.CalledProcessError as e: + print(f"you-get download failed: {e.stderr}") + return False + except FileNotFoundError: + print("you-get not found, please install it: pip install you-get") + return False + + def process_download(bvid: str): """处理单个视频的下载""" try: # 获取视频信息 info, related_items = asyncio.run(fetch_video_info_and_related(bvid)) title = info.get("title", bvid) - file_name = build_output_file_name(bvid, title) - print(f"fetching info done: {bvid}, title: {title}") + # 获取观看量 + view_count = info.get("stat", {}).get("view", 0) + + print(f"fetching info done: {bvid}, title: {title}, views: {view_count}") + + # 判断观看量是否超过 50w (500000) + if view_count < 500000: + print(f"skipped: {bvid}, view count {view_count} < 500000") + # 记录跳过的视频信息 + save_skipped_bvid(bvid, view_count, title) + # 仍然添加相关视频到队列 + inserted, skipped = save_related_bvids(bvid, related_items) + print(f"related videos: inserted={inserted}, skipped={skipped}") + return + + print(f"view count {view_count} >= 500000, downloading...") + + file_name = build_output_file_name(bvid, title) # 检查文件是否已存在 final_path = get_output_path(file_name) @@ -235,14 +285,18 @@ def process_download(bvid: str): downloaded_set.add(bvid) return - # 下载视频 - final_path = asyncio.run(download_video_file(bvid, file_name)) - print(f"download done: {bvid} -> {final_path}") + # 使用 you-get 下载视频 + success = download_with_youget(bvid, title, OUTPUT_DIR) - # 标记为已下载并保存到文件 - with queue_lock: - downloaded_set.add(bvid) - save_downloaded_bvid(bvid) + if success: + print(f"download done: {bvid}") + + # 标记为已下载并保存到文件 + with queue_lock: + downloaded_set.add(bvid) + save_downloaded_bvid(bvid) + else: + print(f"download failed: {bvid}") # 添加相关视频到队列 inserted, skipped = save_related_bvids(bvid, related_items)