diff --git a/Readme.md b/Readme.md index b3f8b8c..13886ee 100644 --- a/Readme.md +++ b/Readme.md @@ -1,35 +1,218 @@ -# 所需环境 -python=3.10 +# Bilibili 视频下载器 -ffmpeg +一个基于 bilibili-api-python 的 B 站视频下载工具,支持自动发现相关视频并批量下载。 +## 功能特性 -# 所需数据库 +- 🎥 支持 B 站视频下载 +- 🔄 自动发现并下载相关视频 +- 📝 持久化保存已下载视频记录(基于 bvid 去重) +- 🚀 多线程下载,后台自动处理 +- 💾 自动合并音视频流 + +## 环境要求 + +- Python 3.10+ +- ffmpeg +- 网络连接 + +## 安装依赖 ```bash -mongo:可以自建也可以使用 +pip install bilibili-api-python ``` -[免费的云](https://cloud.mongodb.com) -# 使用 +## 安装 ffmpeg + +### Windows + +#### 方法一:使用 winget(推荐) + +```powershell +winget install ffmpeg +``` + +#### 方法二:手动安装 + +1. 访问 [ffmpeg 官网](https://ffmpeg.org/download.html) 或 [BtbN FFmpeg Builds](https://github.com/BtbN/FFmpeg-Builds/releases) +2. 下载 Windows 版本的 ffmpeg(选择 `ffmpeg-master-latest-win64-gpl.zip`) +3. 解压到 `C:\ffmpeg` 目录 +4. 将 `C:\ffmpeg\bin` 添加到系统环境变量 PATH 中: + - 右键「此电脑」→「属性」→「高级系统设置」→「环境变量」 + - 在「系统变量」中找到 Path,点击「编辑」 + - 添加 `C:\ffmpeg\bin` + - 点击「确定」保存 + +#### 方法三:使用 Chocolatey + +```powershell +choco install ffmpeg +``` + +### macOS + +#### 使用国内 Homebrew 镜像安装(推荐) + +由于网络原因,建议使用国内镜像源: + +**1. 配置 Homebrew 国内镜像** -## 先安装依赖 ```bash -pip install -r requerment.txt +# 替换 Homebrew 源 +export HOMEBREW_BREW_GIT_REMOTE="https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/brew.git" +export HOMEBREW_BOTTLE_DOMAIN="https://mirrors.tuna.tsinghua.edu.cn/homebrew-bottles" + +# 重置 Homebrew +git -C "$(brew --repo)" remote set-url origin https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/brew.git +git -C "$(brew --repo homebrew/core)" remote set-url origin https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/homebrew-core.git ``` -## 配置数据库的地址 +**2. 安装 ffmpeg** + ```bash -MONGO_URI = "mongodb://192.168.28.9:27017/" -``` -## 检查ffmpeg是否下载 -``` -ffmpeg --version -# 未下载需要下载 brew install ffmpeg +brew install ffmpeg ``` -## 将数据写入到urls.txt中 +#### 使用官方 Homebrew 安装 -一行一个链接 +```bash +# 安装 Homebrew(如果未安装) +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" -## 启动脚本 \ No newline at end of file +# 安装 ffmpeg +brew install ffmpeg +``` + +### Linux + +#### Ubuntu/Debian + +```bash +sudo apt update +sudo apt install ffmpeg +``` + +#### CentOS/RHEL/Fedora + +**CentOS/RHEL:** +```bash +sudo yum install epel-release +sudo yum install ffmpeg +``` + +**Fedora:** +```bash +sudo dnf install ffmpeg +``` + +#### Arch Linux + +```bash +sudo pacman -S ffmpeg +``` + +#### 使用 Snap 安装 + +```bash +sudo snap install ffmpeg +``` + +## 使用方法 + +### 1. 运行程序 + +```bash +python spider.py +``` + +### 2. 输入视频链接 + +程序启动后,会显示提示符: + +``` +================================================== +Bilibili Video Downloader +================================================== +Download worker started in background +Enter bilibili video URLs to download +Related videos will be automatically discovered and queued +Type 'quit' to exit +================================================== +input worker started, waiting for URLs... +Enter a bilibili URL (or 'quit' to exit): +> +``` + +### 3. 输入 B 站视频链接 + +支持的链接格式: +- `https://www.bilibili.com/video/BV1xx411c7mD` +- `https://b23.tv/xxxxx` +- 直接输入 bvid:`BV1xx411c7mD` + +示例: +``` +> https://www.bilibili.com/video/BV1xx411c7mD +added to queue: BV1xx411c7mD, queue size: 1 +``` + +### 4. 自动下载 + +- 程序会在后台自动下载队列中的视频 +- 下载完成后会自动发现相关视频并添加到队列 +- 已下载的视频会记录在 `downloaded_bvids.txt` 文件中,避免重复下载 + +### 5. 退出程序 + +``` +> quit +stopping... +``` + +## 文件说明 + +``` +spider/ +├── spider.py # 主程序 +├── downloaded_bvids.txt # 已下载视频记录(自动生成) +├── downloads/ # 下载目录(自动生成) +│ ├── BV1xx411c7mD_视频标题.mp4 +│ └── ... +├── requirements.txt # Python 依赖 +└── README.md # 本文件 +``` + +## 注意事项 + +1. **首次运行**:程序会自动查找 ffmpeg,请确保已正确安装 +2. **下载位置**:视频默认下载到 `downloads/` 目录 +3. **去重机制**:程序基于 bvid 进行去重,已下载的视频不会重复下载 +4. **网络要求**:需要稳定的网络连接,建议使用国内网络环境 +5. **存储空间**:请确保有足够的磁盘空间存储视频 + +## 常见问题 + +### Q: 提示 "ffmpeg not found" + +A: 请按照上述安装教程安装 ffmpeg,并确保添加到系统环境变量中。 + +### Q: 下载速度慢 + +A: 下载速度取决于网络环境和 B 站服务器,建议在网络较好的时段下载。 + +### Q: 如何重新下载已下载的视频 + +A: 删除 `downloaded_bvids.txt` 文件中对应的 bvid 行,或删除整个文件重新开始。 + +### Q: 程序卡住不动 + +A: 可能是网络问题,可以按 `Ctrl+C` 终止程序后重新运行。 + +## 许可证 + +MIT License + +## 致谢 + +- [bilibili-api-python](https://github.com/MoyuScript/bilibili-api-python) - B 站 API 库 +- [FFmpeg](https://ffmpeg.org/) - 音视频处理工具 \ No newline at end of file diff --git a/spider.py b/spider.py index 42f576b..8cf6d8a 100644 --- a/spider.py +++ b/spider.py @@ -3,32 +3,74 @@ import re import subprocess import threading import time +import shutil from pathlib import Path +from collections import deque from bilibili_api import HEADERS, get_client, video # pip install bilibili-api-python -from pymongo import MongoClient, ReturnDocument # pip install pymongo -MONGO_URI = "mongodb://192.168.28.9:27017/" -MONGO_DB_NAME = "bilibiliss" -VIDEO_COLLECTION = "bilibili_video_pool" - BASE_DIR = Path(__file__).resolve().parent -URLS_FILE = BASE_DIR / "urls.txt" OUTPUT_DIR = BASE_DIR / "downloads" -FFMPEG_PATH = "ffmpeg" +DOWNLOADED_BVID_FILE = BASE_DIR / "downloaded_bvids.txt" # 已下载 bvid 记录文件 +FFMPEG_PATH = None # Will be set by find_ffmpeg() POLL_SECONDS = 2 FETCH_RELATED_LIMIT = 20 -STATUS_INIT = "init" -STATUS_FETCHING = "fetching" -STATUS_READY = "ready" -STATUS_DOWNLOADING = "downloading" -STATUS_DONE = "done" -STATUS_ERROR = "error" -client = MongoClient(MONGO_URI) -collection = client[MONGO_DB_NAME][VIDEO_COLLECTION] +def find_ffmpeg(): + """查找 ffmpeg 可执行文件的路径""" + # 首先尝试在系统 PATH 中查找 + ffmpeg_path = shutil.which("ffmpeg") + if ffmpeg_path: + return ffmpeg_path + + # Windows 常见的 ffmpeg 安装位置 + common_paths = [ + r"C:\ffmpeg\bin\ffmpeg.exe", + r"C:\Program Files\ffmpeg\bin\ffmpeg.exe", + r"C:\Program Files (x86)\ffmpeg\bin\ffmpeg.exe", + Path.home() / "ffmpeg" / "bin" / "ffmpeg.exe", + Path.home() / "Downloads" / "ffmpeg" / "bin" / "ffmpeg.exe", + BASE_DIR / "ffmpeg" / "bin" / "ffmpeg.exe", + BASE_DIR / "ffmpeg.exe", + ] + + for path in common_paths: + if Path(path).exists(): + return str(path) + + return None + +# 使用线程安全的队列替代数据库 +video_queue = deque() # 待下载的视频队列 +downloaded_set = set() # 已下载的视频集合,防止重复 +queue_lock = threading.Lock() + + +def load_downloaded_bvids(): + """从文件加载已下载的 bvid 列表""" + global downloaded_set + if DOWNLOADED_BVID_FILE.exists(): + try: + with open(DOWNLOADED_BVID_FILE, "r", encoding="utf-8") as f: + downloaded_set = set(line.strip() for line in f if line.strip()) + print(f"loaded {len(downloaded_set)} downloaded bvids from file") + except Exception as e: + print(f"failed to load downloaded bvids: {e}") + downloaded_set = set() + else: + print("no downloaded bvids file found, starting fresh") + downloaded_set = set() + + +def save_downloaded_bvid(bvid: str): + """将 bvid 追加保存到文件""" + try: + with open(DOWNLOADED_BVID_FILE, "a", encoding="utf-8") as f: + f.write(bvid + "\n") + except Exception as e: + print(f"failed to save bvid {bvid}: {e}") def get_bvid_from_url(url: str) -> str: @@ -56,85 +98,28 @@ def get_output_path(file_name: str) -> Path: return OUTPUT_DIR / file_name -def ensure_indexes(): - collection.create_index("bvid", unique=True) - collection.create_index("status") +def add_to_queue(bvid: str, source_url: str = ""): + """添加视频到下载队列""" + with queue_lock: + if bvid not in downloaded_set and bvid not in [item["bvid"] for item in video_queue]: + video_queue.append({ + "bvid": bvid, + "source_url": source_url or f"https://www.bilibili.com/video/{bvid}", + "added_at": time.time() + }) + print(f"added to queue: {bvid}, queue size: {len(video_queue)}") + return True + else: + print(f"skipped duplicate: {bvid}") + return False -def reset_in_progress_docs(): - fetch_reset = collection.update_many( - {"status": STATUS_FETCHING}, - {"$set": {"status": STATUS_INIT, "updated_at": time.time()}}, - ) - download_reset = collection.update_many( - {"status": STATUS_DOWNLOADING}, - {"$set": {"status": STATUS_READY, "updated_at": time.time()}}, - ) - print( - f"reset in-progress docs: fetching={fetch_reset.modified_count}, " - f"downloading={download_reset.modified_count}" - ) - - -def seed_from_urls_file(): - if not URLS_FILE.exists(): - print(f"seed skipped, file not found: {URLS_FILE}") - return - - inserted = 0 - skipped = 0 - with open(URLS_FILE, "r", encoding="utf-8") as file_obj: - for raw_line in file_obj: - url = raw_line.strip() - if not url: - continue - try: - bvid = get_bvid_from_url(url) - except ValueError: - skipped += 1 - print(f"seed skipped, invalid url: {url}") - continue - - result = collection.update_one( - {"bvid": bvid}, - { - "$setOnInsert": { - "bvid": bvid, - "source_url": url, - "video_url": f"https://www.bilibili.com/video/{bvid}", - "title": "", - "download_file_name": f"{bvid}.mp4", - "status": STATUS_INIT, - "related_fetched": False, - "downloaded": False, - "created_at": time.time(), - } - }, - upsert=True, - ) - if result.upserted_id is not None: - inserted += 1 - else: - skipped += 1 - print(f"seed duplicate skipped: {bvid}") - - print(f"seed complete, inserted={inserted}, skipped={skipped}") - - -def claim_fetch_doc(): - return collection.find_one_and_update( - {"status": STATUS_INIT, "related_fetched": False}, - {"$set": {"status": STATUS_FETCHING, "fetch_started_at": time.time()}}, - return_document=ReturnDocument.AFTER, - ) - - -def claim_download_doc(): - return collection.find_one_and_update( - {"status": STATUS_READY, "downloaded": False}, - {"$set": {"status": STATUS_DOWNLOADING, "download_started_at": time.time()}}, - return_document=ReturnDocument.AFTER, - ) +def get_from_queue(): + """从队列获取一个待下载的视频""" + with queue_lock: + if video_queue: + return video_queue.popleft() + return None async def fetch_video_info_and_related(bvid: str): @@ -145,39 +130,19 @@ async def fetch_video_info_and_related(bvid: str): def save_related_bvids(parent_bvid: str, related_items): + """将相关视频添加到队列""" inserted = 0 skipped = 0 for item in related_items[:FETCH_RELATED_LIMIT]: related_bvid = item.get("bvid") if not related_bvid: continue - - result = collection.update_one( - {"bvid": related_bvid}, - { - "$setOnInsert": { - "bvid": related_bvid, - "source_url": f"https://www.bilibili.com/video/{related_bvid}", - "video_url": f"https://www.bilibili.com/video/{related_bvid}", - "title": item.get("title", ""), - "download_file_name": build_output_file_name( - related_bvid, item.get("title", "") - ), - "status": STATUS_INIT, - "related_fetched": False, - "downloaded": False, - "parent_bvid": parent_bvid, - "created_at": time.time(), - } - }, - upsert=True, - ) - if result.upserted_id is not None: + + if add_to_queue(related_bvid, f"https://www.bilibili.com/video/{related_bvid}"): inserted += 1 - print(f"related inserted: {related_bvid}") else: skipped += 1 - print(f"related duplicate skipped: {related_bvid}") + return inserted, skipped @@ -196,6 +161,8 @@ async def download_stream(url: str, output_path: Path, intro: str): def merge_media(video_path: Path, audio_path: Path, output_path: Path): + if not FFMPEG_PATH: + raise RuntimeError("ffmpeg not found. Please install ffmpeg and add it to PATH.") subprocess.run( [ FFMPEG_PATH, @@ -215,6 +182,8 @@ def merge_media(video_path: Path, audio_path: Path, output_path: Path): def convert_flv_to_mp4(source_path: Path, output_path: Path): + if not FFMPEG_PATH: + raise RuntimeError("ffmpeg not found. Please install ffmpeg and add it to PATH.") subprocess.run( [FFMPEG_PATH, "-y", "-i", str(source_path), str(output_path)], check=True, @@ -248,129 +217,121 @@ async def download_video_file(bvid: str, output_file_name: str): return final_path -def mark_doc_downloaded(doc, final_path: Path): - collection.update_one( - {"_id": doc["_id"]}, - { - "$set": { - "status": STATUS_DONE, - "downloaded": True, - "file_path": str(final_path), - "updated_at": time.time(), - } - }, - ) - - -def process_fetch_doc(doc): - bvid = doc["bvid"] +def process_download(bvid: str): + """处理单个视频的下载""" try: + # 获取视频信息 info, related_items = asyncio.run(fetch_video_info_and_related(bvid)) - title = info.get("title", "") + title = info.get("title", bvid) file_name = build_output_file_name(bvid, title) - inserted, skipped = save_related_bvids(bvid, related_items) - collection.update_one( - {"_id": doc["_id"]}, - { - "$set": { - "title": title, - "aid": info.get("aid"), - "cid": info.get("cid"), - "owner_name": (info.get("owner") or {}).get("name", ""), - "video_url": f"https://www.bilibili.com/video/{bvid}", - "download_file_name": file_name, - "related_fetched": True, - "status": STATUS_READY, - "related_inserted_count": inserted, - "related_skipped_count": skipped, - "updated_at": time.time(), - } - }, - ) - print(f"fetch done: {bvid}, related_inserted={inserted}, related_skipped={skipped}") - except Exception as exc: - collection.update_one( - {"_id": doc["_id"]}, - { - "$set": { - "status": STATUS_ERROR, - "fetch_error": str(exc), - "updated_at": time.time(), - } - }, - ) - print(f"fetch failed: {bvid}, error={exc}") - - -def process_download_doc(doc): - bvid = doc["bvid"] - file_name = doc.get("download_file_name") or f"{bvid}.mp4" - final_path = get_output_path(file_name) - - if doc.get("downloaded") and final_path.exists(): - print(f"download already marked and file exists, skipped: {bvid}") - return - - if final_path.exists(): - mark_doc_downloaded(doc, final_path) - print(f"download file already exists, skipped: {bvid} -> {final_path}") - return - - try: + + print(f"fetching info done: {bvid}, title: {title}") + + # 检查文件是否已存在 + final_path = get_output_path(file_name) + if final_path.exists(): + print(f"file already exists, skipped: {bvid} -> {final_path}") + with queue_lock: + downloaded_set.add(bvid) + return + + # 下载视频 final_path = asyncio.run(download_video_file(bvid, file_name)) - mark_doc_downloaded(doc, final_path) print(f"download done: {bvid} -> {final_path}") + + # 标记为已下载并保存到文件 + with queue_lock: + downloaded_set.add(bvid) + save_downloaded_bvid(bvid) + + # 添加相关视频到队列 + inserted, skipped = save_related_bvids(bvid, related_items) + print(f"related videos: inserted={inserted}, skipped={skipped}") + except Exception as exc: - collection.update_one( - {"_id": doc["_id"]}, - { - "$set": { - "status": STATUS_READY, - "download_error": str(exc), - "updated_at": time.time(), - } - }, - ) print(f"download failed: {bvid}, error={exc}") -def fetch_worker(): - while True: - doc = claim_fetch_doc() - if not doc: - time.sleep(POLL_SECONDS) - continue - process_fetch_doc(doc) - - def download_worker(): + """下载工作线程:从队列中取出视频并下载""" + print("download worker started") while True: - doc = claim_download_doc() - if not doc: + item = get_from_queue() + if not item: time.sleep(POLL_SECONDS) continue - process_download_doc(doc) + + bvid = item["bvid"] + print(f"processing: {bvid}") + process_download(bvid) + + +def input_worker(): + """输入工作线程:接收用户输入的链接""" + print("input worker started, waiting for URLs...") + print("Enter a bilibili URL (or 'quit' to exit):") + + while True: + try: + url = input("> ").strip() + if url.lower() == 'quit': + print("stopping...") + break + + if not url: + continue + + try: + bvid = get_bvid_from_url(url) + add_to_queue(bvid, url) + except ValueError as e: + print(f"invalid url: {e}") + print("please enter a valid bilibili video URL") + + except EOFError: + break + except KeyboardInterrupt: + break def main(): - ensure_indexes() - reset_in_progress_docs() - seed_from_urls_file() - - fetch_thread = threading.Thread(target=fetch_worker, daemon=True, name="fetch-worker") - download_thread = threading.Thread( - target=download_worker, daemon=True, name="download-worker" - ) - fetch_thread.start() + global FFMPEG_PATH + + # 查找 ffmpeg + FFMPEG_PATH = find_ffmpeg() + if not FFMPEG_PATH: + print("=" * 50) + print("ERROR: ffmpeg not found!") + print("=" * 50) + print("Please install ffmpeg and add it to your system PATH.") + print("Download from: https://ffmpeg.org/download.html") + print("Or place ffmpeg.exe in one of these locations:") + print(" - C:\\ffmpeg\\bin\\") + print(" - Current directory") + print("=" * 50) + return + + print(f"ffmpeg found at: {FFMPEG_PATH}") + + # 加载已下载的 bvid 列表 + load_downloaded_bvids() + + # 启动下载工作线程 + download_thread = threading.Thread(target=download_worker, daemon=True, name="download-worker") download_thread.start() - - print("workers started: fetch-worker, download-worker") - try: - while True: - time.sleep(1) - except KeyboardInterrupt: - print("stopping...") + + print("=" * 50) + print("Bilibili Video Downloader") + print("=" * 50) + print("Download worker started in background") + print("Enter bilibili video URLs to download") + print("Related videos will be automatically discovered and queued") + print("Type 'quit' to exit") + print("=" * 50) + + # 主线程处理用户输入 + input_worker() if __name__ == "__main__": - main() + main() \ No newline at end of file