fix:移除了需要链接数据库的问题

This commit is contained in:
爱喝水的木子
2026-03-28 20:25:20 +08:00
parent ece892e9a9
commit 268bffa7d9
2 changed files with 385 additions and 241 deletions

221
Readme.md
View File

@@ -1,35 +1,218 @@
# 所需环境 # Bilibili 视频下载器
python=3.10
ffmpeg 一个基于 bilibili-api-python 的 B 站视频下载工具,支持自动发现相关视频并批量下载。
## 功能特性
# 所需数据库 - 🎥 支持 B 站视频下载
- 🔄 自动发现并下载相关视频
- 📝 持久化保存已下载视频记录(基于 bvid 去重)
- 🚀 多线程下载,后台自动处理
- 💾 自动合并音视频流
## 环境要求
- Python 3.10+
- ffmpeg
- 网络连接
## 安装依赖
```bash ```bash
mongo:可以自建也可以使用 pip install bilibili-api-python
``` ```
[免费的云](https://cloud.mongodb.com)
# 使用 ## 安装 ffmpeg
### Windows
#### 方法一:使用 winget(推荐)
```powershell
winget install ffmpeg
```
#### 方法二:手动安装
1. 访问 [ffmpeg 官网](https://ffmpeg.org/download.html) 或 [BtbN FFmpeg Builds](https://github.com/BtbN/FFmpeg-Builds/releases)
2. 下载 Windows 版本的 ffmpeg(选择 `ffmpeg-master-latest-win64-gpl.zip`)
3. 解压到 `C:\ffmpeg` 目录
4. 将 `C:\ffmpeg\bin` 添加到系统环境变量 PATH 中:
- 右键「此电脑」→「属性」→「高级系统设置」→「环境变量」
- 在「系统变量」中找到 Path,点击「编辑」
- 添加 `C:\ffmpeg\bin`
- 点击「确定」保存
#### 方法三:使用 Chocolatey
```powershell
choco install ffmpeg
```
### macOS
#### 使用国内 Homebrew 镜像安装(推荐)
由于网络原因,建议使用国内镜像源:
**1. 配置 Homebrew 国内镜像**
## 先安装依赖
```bash ```bash
pip install -r requerment.txt # 替换 Homebrew 源
export HOMEBREW_BREW_GIT_REMOTE="https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/brew.git"
export HOMEBREW_BOTTLE_DOMAIN="https://mirrors.tuna.tsinghua.edu.cn/homebrew-bottles"
# 重置 Homebrew
git -C "$(brew --repo)" remote set-url origin https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/brew.git
git -C "$(brew --repo homebrew/core)" remote set-url origin https://mirrors.tuna.tsinghua.edu.cn/git/homebrew/homebrew-core.git
``` ```
## 配置数据库的地址 **2. 安装 ffmpeg**
```bash ```bash
MONGO_URI = "mongodb://192.168.28.9:27017/" brew install ffmpeg
```
## 检查ffmpeg是否下载
```
ffmpeg --version
# 未下载需要下载 brew install ffmpeg
``` ```
## 将数据写入到urls.txt中 #### 使用官方 Homebrew 安装
一行一个链接 ```bash
# 安装 Homebrew(如果未安装)
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
## 启动脚本 # 安装 ffmpeg
brew install ffmpeg
```
### Linux
#### Ubuntu/Debian
```bash
sudo apt update
sudo apt install ffmpeg
```
#### CentOS/RHEL/Fedora
**CentOS/RHEL**
```bash
sudo yum install epel-release
sudo yum install ffmpeg
```
**Fedora**
```bash
sudo dnf install ffmpeg
```
#### Arch Linux
```bash
sudo pacman -S ffmpeg
```
#### 使用 Snap 安装
```bash
sudo snap install ffmpeg
```
## 使用方法
### 1. 运行程序
```bash
python spider.py
```
### 2. 输入视频链接
程序启动后,会显示提示符:
```
==================================================
Bilibili Video Downloader
==================================================
Download worker started in background
Enter bilibili video URLs to download
Related videos will be automatically discovered and queued
Type 'quit' to exit
==================================================
input worker started, waiting for URLs...
Enter a bilibili URL (or 'quit' to exit):
>
```
### 3. 输入 B 站视频链接
支持的链接格式:
- `https://www.bilibili.com/video/BV1xx411c7mD`
- `https://b23.tv/xxxxx`
- 直接输入 bvid:`BV1xx411c7mD`
示例:
```
> https://www.bilibili.com/video/BV1xx411c7mD
added to queue: BV1xx411c7mD, queue size: 1
```
### 4. 自动下载
- 程序会在后台自动下载队列中的视频
- 下载完成后会自动发现相关视频并添加到队列
- 已下载的视频会记录在 `downloaded_bvids.txt` 文件中,避免重复下载
### 5. 退出程序
```
> quit
stopping...
```
## 文件说明
```
spider/
├── spider.py # 主程序
├── downloaded_bvids.txt # 已下载视频记录(自动生成)
├── downloads/ # 下载目录(自动生成)
│ ├── BV1xx411c7mD_视频标题.mp4
│ └── ...
├── requirements.txt # Python 依赖
└── README.md # 本文件
```
## 注意事项
1. **首次运行**:程序会自动查找 ffmpeg,请确保已正确安装
2. **下载位置**:视频默认下载到 `downloads/` 目录
3. **去重机制**:程序基于 bvid 进行去重,已下载的视频不会重复下载
4. **网络要求**:需要稳定的网络连接,建议使用国内网络环境
5. **存储空间**:请确保有足够的磁盘空间存储视频
## 常见问题
### Q: 提示 "ffmpeg not found"
A: 请按照上述安装教程安装 ffmpeg,并确保添加到系统环境变量中。
### Q: 下载速度慢
A: 下载速度取决于网络环境和 B 站服务器,建议在网络较好的时段下载。
### Q: 如何重新下载已下载的视频
A: 删除 `downloaded_bvids.txt` 文件中对应的 bvid 行,或删除整个文件重新开始。
### Q: 程序卡住不动
A: 可能是网络问题,可以按 `Ctrl+C` 终止程序后重新运行。
## 许可证
MIT License
## 致谢
- [bilibili-api-python](https://github.com/MoyuScript/bilibili-api-python) - B 站 API 库
- [FFmpeg](https://ffmpeg.org/) - 音视频处理工具

391
spider.py
View File

@@ -3,32 +3,74 @@ import re
import subprocess import subprocess
import threading import threading
import time import time
import shutil
from pathlib import Path from pathlib import Path
from collections import deque
from bilibili_api import HEADERS, get_client, video # pip install bilibili-api-python from bilibili_api import HEADERS, get_client, video # pip install bilibili-api-python
from pymongo import MongoClient, ReturnDocument # pip install pymongo
MONGO_URI = "mongodb://192.168.28.9:27017/"
MONGO_DB_NAME = "bilibiliss"
VIDEO_COLLECTION = "bilibili_video_pool"
BASE_DIR = Path(__file__).resolve().parent BASE_DIR = Path(__file__).resolve().parent
URLS_FILE = BASE_DIR / "urls.txt"
OUTPUT_DIR = BASE_DIR / "downloads" OUTPUT_DIR = BASE_DIR / "downloads"
FFMPEG_PATH = "ffmpeg" DOWNLOADED_BVID_FILE = BASE_DIR / "downloaded_bvids.txt" # 已下载 bvid 记录文件
FFMPEG_PATH = None # Will be set by find_ffmpeg()
POLL_SECONDS = 2 POLL_SECONDS = 2
FETCH_RELATED_LIMIT = 20 FETCH_RELATED_LIMIT = 20
STATUS_INIT = "init"
STATUS_FETCHING = "fetching"
STATUS_READY = "ready"
STATUS_DOWNLOADING = "downloading"
STATUS_DONE = "done"
STATUS_ERROR = "error"
client = MongoClient(MONGO_URI) def find_ffmpeg():
collection = client[MONGO_DB_NAME][VIDEO_COLLECTION] """查找 ffmpeg 可执行文件的路径"""
# 首先尝试在系统 PATH 中查找
ffmpeg_path = shutil.which("ffmpeg")
if ffmpeg_path:
return ffmpeg_path
# Windows 常见的 ffmpeg 安装位置
common_paths = [
r"C:\ffmpeg\bin\ffmpeg.exe",
r"C:\Program Files\ffmpeg\bin\ffmpeg.exe",
r"C:\Program Files (x86)\ffmpeg\bin\ffmpeg.exe",
Path.home() / "ffmpeg" / "bin" / "ffmpeg.exe",
Path.home() / "Downloads" / "ffmpeg" / "bin" / "ffmpeg.exe",
BASE_DIR / "ffmpeg" / "bin" / "ffmpeg.exe",
BASE_DIR / "ffmpeg.exe",
]
for path in common_paths:
if Path(path).exists():
return str(path)
return None
# 使用线程安全的队列替代数据库
video_queue = deque() # 待下载的视频队列
downloaded_set = set() # 已下载的视频集合,防止重复
queue_lock = threading.Lock()
def load_downloaded_bvids():
    """Load previously downloaded bvids from the record file.

    Rebinds the module-level ``downloaded_set`` to the set of non-blank,
    whitespace-stripped lines read from ``DOWNLOADED_BVID_FILE``. When the
    file is missing, unreadable, or not valid UTF-8, ``downloaded_set`` is
    reset to an empty set and a message is printed instead of raising.
    """
    global downloaded_set
    try:
        # EAFP: open directly rather than exists()-then-open, so a file that
        # disappears between the check and the open is still handled cleanly.
        with open(DOWNLOADED_BVID_FILE, "r", encoding="utf-8") as f:
            loaded = {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        print("no downloaded bvids file found, starting fresh")
        downloaded_set = set()
    except (OSError, UnicodeError) as e:
        # Only I/O and decoding failures are expected here; anything else is
        # a programming error and should surface.
        print(f"failed to load downloaded bvids: {e}")
        downloaded_set = set()
    else:
        downloaded_set = loaded
        print(f"loaded {len(downloaded_set)} downloaded bvids from file")
def save_downloaded_bvid(bvid: str):
    """Append one bvid, followed by a newline, to the downloaded-bvid record file.

    Any exception raised while opening or writing the file is caught and
    reported with a printed message; nothing is raised to the caller.
    """
    try:
        with open(DOWNLOADED_BVID_FILE, mode="a", encoding="utf-8") as record_file:
            record_line = bvid + "\n"
            record_file.write(record_line)
    except Exception as err:
        print(f"failed to save bvid {bvid}: {err}")
def get_bvid_from_url(url: str) -> str: def get_bvid_from_url(url: str) -> str:
@@ -56,85 +98,28 @@ def get_output_path(file_name: str) -> Path:
return OUTPUT_DIR / file_name return OUTPUT_DIR / file_name
def ensure_indexes(): def add_to_queue(bvid: str, source_url: str = ""):
collection.create_index("bvid", unique=True) """添加视频到下载队列"""
collection.create_index("status") with queue_lock:
if bvid not in downloaded_set and bvid not in [item["bvid"] for item in video_queue]:
video_queue.append({
"bvid": bvid,
"source_url": source_url or f"https://www.bilibili.com/video/{bvid}",
"added_at": time.time()
})
print(f"added to queue: {bvid}, queue size: {len(video_queue)}")
return True
else:
print(f"skipped duplicate: {bvid}")
return False
def reset_in_progress_docs(): def get_from_queue():
fetch_reset = collection.update_many( """从队列获取一个待下载的视频"""
{"status": STATUS_FETCHING}, with queue_lock:
{"$set": {"status": STATUS_INIT, "updated_at": time.time()}}, if video_queue:
) return video_queue.popleft()
download_reset = collection.update_many( return None
{"status": STATUS_DOWNLOADING},
{"$set": {"status": STATUS_READY, "updated_at": time.time()}},
)
print(
f"reset in-progress docs: fetching={fetch_reset.modified_count}, "
f"downloading={download_reset.modified_count}"
)
def seed_from_urls_file():
if not URLS_FILE.exists():
print(f"seed skipped, file not found: {URLS_FILE}")
return
inserted = 0
skipped = 0
with open(URLS_FILE, "r", encoding="utf-8") as file_obj:
for raw_line in file_obj:
url = raw_line.strip()
if not url:
continue
try:
bvid = get_bvid_from_url(url)
except ValueError:
skipped += 1
print(f"seed skipped, invalid url: {url}")
continue
result = collection.update_one(
{"bvid": bvid},
{
"$setOnInsert": {
"bvid": bvid,
"source_url": url,
"video_url": f"https://www.bilibili.com/video/{bvid}",
"title": "",
"download_file_name": f"{bvid}.mp4",
"status": STATUS_INIT,
"related_fetched": False,
"downloaded": False,
"created_at": time.time(),
}
},
upsert=True,
)
if result.upserted_id is not None:
inserted += 1
else:
skipped += 1
print(f"seed duplicate skipped: {bvid}")
print(f"seed complete, inserted={inserted}, skipped={skipped}")
def claim_fetch_doc():
return collection.find_one_and_update(
{"status": STATUS_INIT, "related_fetched": False},
{"$set": {"status": STATUS_FETCHING, "fetch_started_at": time.time()}},
return_document=ReturnDocument.AFTER,
)
def claim_download_doc():
return collection.find_one_and_update(
{"status": STATUS_READY, "downloaded": False},
{"$set": {"status": STATUS_DOWNLOADING, "download_started_at": time.time()}},
return_document=ReturnDocument.AFTER,
)
async def fetch_video_info_and_related(bvid: str): async def fetch_video_info_and_related(bvid: str):
@@ -145,6 +130,7 @@ async def fetch_video_info_and_related(bvid: str):
def save_related_bvids(parent_bvid: str, related_items): def save_related_bvids(parent_bvid: str, related_items):
"""将相关视频添加到队列"""
inserted = 0 inserted = 0
skipped = 0 skipped = 0
for item in related_items[:FETCH_RELATED_LIMIT]: for item in related_items[:FETCH_RELATED_LIMIT]:
@@ -152,32 +138,11 @@ def save_related_bvids(parent_bvid: str, related_items):
if not related_bvid: if not related_bvid:
continue continue
result = collection.update_one( if add_to_queue(related_bvid, f"https://www.bilibili.com/video/{related_bvid}"):
{"bvid": related_bvid},
{
"$setOnInsert": {
"bvid": related_bvid,
"source_url": f"https://www.bilibili.com/video/{related_bvid}",
"video_url": f"https://www.bilibili.com/video/{related_bvid}",
"title": item.get("title", ""),
"download_file_name": build_output_file_name(
related_bvid, item.get("title", "")
),
"status": STATUS_INIT,
"related_fetched": False,
"downloaded": False,
"parent_bvid": parent_bvid,
"created_at": time.time(),
}
},
upsert=True,
)
if result.upserted_id is not None:
inserted += 1 inserted += 1
print(f"related inserted: {related_bvid}")
else: else:
skipped += 1 skipped += 1
print(f"related duplicate skipped: {related_bvid}")
return inserted, skipped return inserted, skipped
@@ -196,6 +161,8 @@ async def download_stream(url: str, output_path: Path, intro: str):
def merge_media(video_path: Path, audio_path: Path, output_path: Path): def merge_media(video_path: Path, audio_path: Path, output_path: Path):
if not FFMPEG_PATH:
raise RuntimeError("ffmpeg not found. Please install ffmpeg and add it to PATH.")
subprocess.run( subprocess.run(
[ [
FFMPEG_PATH, FFMPEG_PATH,
@@ -215,6 +182,8 @@ def merge_media(video_path: Path, audio_path: Path, output_path: Path):
def convert_flv_to_mp4(source_path: Path, output_path: Path): def convert_flv_to_mp4(source_path: Path, output_path: Path):
if not FFMPEG_PATH:
raise RuntimeError("ffmpeg not found. Please install ffmpeg and add it to PATH.")
subprocess.run( subprocess.run(
[FFMPEG_PATH, "-y", "-i", str(source_path), str(output_path)], [FFMPEG_PATH, "-y", "-i", str(source_path), str(output_path)],
check=True, check=True,
@@ -248,128 +217,120 @@ async def download_video_file(bvid: str, output_file_name: str):
return final_path return final_path
def mark_doc_downloaded(doc, final_path: Path): def process_download(bvid: str):
collection.update_one( """处理单个视频的下载"""
{"_id": doc["_id"]},
{
"$set": {
"status": STATUS_DONE,
"downloaded": True,
"file_path": str(final_path),
"updated_at": time.time(),
}
},
)
def process_fetch_doc(doc):
bvid = doc["bvid"]
try: try:
# 获取视频信息
info, related_items = asyncio.run(fetch_video_info_and_related(bvid)) info, related_items = asyncio.run(fetch_video_info_and_related(bvid))
title = info.get("title", "") title = info.get("title", bvid)
file_name = build_output_file_name(bvid, title) file_name = build_output_file_name(bvid, title)
inserted, skipped = save_related_bvids(bvid, related_items)
collection.update_one(
{"_id": doc["_id"]},
{
"$set": {
"title": title,
"aid": info.get("aid"),
"cid": info.get("cid"),
"owner_name": (info.get("owner") or {}).get("name", ""),
"video_url": f"https://www.bilibili.com/video/{bvid}",
"download_file_name": file_name,
"related_fetched": True,
"status": STATUS_READY,
"related_inserted_count": inserted,
"related_skipped_count": skipped,
"updated_at": time.time(),
}
},
)
print(f"fetch done: {bvid}, related_inserted={inserted}, related_skipped={skipped}")
except Exception as exc:
collection.update_one(
{"_id": doc["_id"]},
{
"$set": {
"status": STATUS_ERROR,
"fetch_error": str(exc),
"updated_at": time.time(),
}
},
)
print(f"fetch failed: {bvid}, error={exc}")
print(f"fetching info done: {bvid}, title: {title}")
def process_download_doc(doc): # 检查文件是否已存在
bvid = doc["bvid"] final_path = get_output_path(file_name)
file_name = doc.get("download_file_name") or f"{bvid}.mp4" if final_path.exists():
final_path = get_output_path(file_name) print(f"file already exists, skipped: {bvid} -> {final_path}")
with queue_lock:
downloaded_set.add(bvid)
return
if doc.get("downloaded") and final_path.exists(): # 下载视频
print(f"download already marked and file exists, skipped: {bvid}")
return
if final_path.exists():
mark_doc_downloaded(doc, final_path)
print(f"download file already exists, skipped: {bvid} -> {final_path}")
return
try:
final_path = asyncio.run(download_video_file(bvid, file_name)) final_path = asyncio.run(download_video_file(bvid, file_name))
mark_doc_downloaded(doc, final_path)
print(f"download done: {bvid} -> {final_path}") print(f"download done: {bvid} -> {final_path}")
# 标记为已下载并保存到文件
with queue_lock:
downloaded_set.add(bvid)
save_downloaded_bvid(bvid)
# 添加相关视频到队列
inserted, skipped = save_related_bvids(bvid, related_items)
print(f"related videos: inserted={inserted}, skipped={skipped}")
except Exception as exc: except Exception as exc:
collection.update_one(
{"_id": doc["_id"]},
{
"$set": {
"status": STATUS_READY,
"download_error": str(exc),
"updated_at": time.time(),
}
},
)
print(f"download failed: {bvid}, error={exc}") print(f"download failed: {bvid}, error={exc}")
def fetch_worker():
while True:
doc = claim_fetch_doc()
if not doc:
time.sleep(POLL_SECONDS)
continue
process_fetch_doc(doc)
def download_worker(): def download_worker():
"""下载工作线程:从队列中取出视频并下载"""
print("download worker started")
while True: while True:
doc = claim_download_doc() item = get_from_queue()
if not doc: if not item:
time.sleep(POLL_SECONDS) time.sleep(POLL_SECONDS)
continue continue
process_download_doc(doc)
bvid = item["bvid"]
print(f"processing: {bvid}")
process_download(bvid)
def input_worker():
    """Read bilibili URLs from standard input and queue them for download.

    Prompts with ``"> "`` in a loop. The loop ends when the user types
    ``quit`` (case-insensitive) or when ``EOFError`` / ``KeyboardInterrupt``
    is raised. Blank input is ignored. Any other input is passed to
    ``get_bvid_from_url`` and the result to ``add_to_queue``; a
    ``ValueError`` raised during that step is reported and the loop continues.
    """
    print("input worker started, waiting for URLs...")
    print("Enter a bilibili URL (or 'quit' to exit):")

    keep_reading = True
    while keep_reading:
        try:
            entered = input("> ").strip()
            if entered.lower() == 'quit':
                print("stopping...")
                keep_reading = False
            elif entered:
                try:
                    add_to_queue(get_bvid_from_url(entered), entered)
                except ValueError as err:
                    print(f"invalid url: {err}")
                    print("please enter a valid bilibili video URL")
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl+C ends the loop without printing anything.
            keep_reading = False
def main(): def main():
ensure_indexes() global FFMPEG_PATH
reset_in_progress_docs()
seed_from_urls_file()
fetch_thread = threading.Thread(target=fetch_worker, daemon=True, name="fetch-worker") # 查找 ffmpeg
download_thread = threading.Thread( FFMPEG_PATH = find_ffmpeg()
target=download_worker, daemon=True, name="download-worker" if not FFMPEG_PATH:
) print("=" * 50)
fetch_thread.start() print("ERROR: ffmpeg not found!")
print("=" * 50)
print("Please install ffmpeg and add it to your system PATH.")
print("Download from: https://ffmpeg.org/download.html")
print("Or place ffmpeg.exe in one of these locations:")
print(" - C:\\ffmpeg\\bin\\")
print(" - Current directory")
print("=" * 50)
return
print(f"ffmpeg found at: {FFMPEG_PATH}")
# 加载已下载的 bvid 列表
load_downloaded_bvids()
# 启动下载工作线程
download_thread = threading.Thread(target=download_worker, daemon=True, name="download-worker")
download_thread.start() download_thread.start()
print("workers started: fetch-worker, download-worker") print("=" * 50)
try: print("Bilibili Video Downloader")
while True: print("=" * 50)
time.sleep(1) print("Download worker started in background")
except KeyboardInterrupt: print("Enter bilibili video URLs to download")
print("stopping...") print("Related videos will be automatically discovered and queued")
print("Type 'quit' to exit")
print("=" * 50)
# 主线程处理用户输入
input_worker()
if __name__ == "__main__": if __name__ == "__main__":