-
-
Notifications
You must be signed in to change notification settings - Fork 431
fix(transcript): validate videoId and harden yt-dlp error handling #619
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -5,6 +5,7 @@ | |||||
| import subprocess | ||||||
| import os | ||||||
| import glob | ||||||
| import tempfile | ||||||
|
|
||||||
| from sklearn.metrics.pairwise import cosine_similarity | ||||||
| from sklearn.feature_extraction.text import TfidfVectorizer | ||||||
|
|
@@ -533,28 +534,63 @@ def clean_transcript(file_path): | |||||
|
|
||||||
| return " ".join(transcript_lines).strip() | ||||||
|
|
||||||
|
|
||||||
| YOUTUBE_VIDEO_ID_PATTERN = re.compile(r"^[A-Za-z0-9_-]{11}$") | ||||||
|
|
||||||
|
|
||||||
| def is_valid_youtube_video_id(video_id): | ||||||
| return bool(YOUTUBE_VIDEO_ID_PATTERN.fullmatch(video_id or "")) | ||||||
|
|
||||||
| @app.route('/getTranscript', methods=['GET']) | ||||||
| def get_transcript(): | ||||||
| video_id = request.args.get('videoId') | ||||||
| video_id = (request.args.get('videoId') or '').strip() | ||||||
| if not video_id: | ||||||
| return jsonify({"error": "No video ID provided"}), 400 | ||||||
|
|
||||||
| subprocess.run(["yt-dlp", "--write-auto-sub", "--sub-lang", "en", "--skip-download", | ||||||
| "--sub-format", "vtt", "-o", f"subtitles/{video_id}.vtt", f"https://www.youtube.com/watch?v={video_id}"], | ||||||
| check=True, capture_output=True, text=True) | ||||||
|
|
||||||
| # Find the latest .vtt file in the "subtitles" folder | ||||||
| subtitle_files = glob.glob("subtitles/*.vtt") | ||||||
| if not subtitle_files: | ||||||
| return jsonify({"error": "No subtitles found"}), 404 | ||||||
| if not is_valid_youtube_video_id(video_id): | ||||||
| return jsonify({"error": "Invalid YouTube video ID format"}), 400 | ||||||
|
|
||||||
| latest_subtitle = max(subtitle_files, key=os.path.getctime) | ||||||
| transcript_text = clean_transcript(latest_subtitle) | ||||||
|
|
||||||
| # Optional: Clean up the file after reading | ||||||
| os.remove(latest_subtitle) | ||||||
|
|
||||||
| return jsonify({"transcript": transcript_text}) | ||||||
| try: | ||||||
| # Use per-request temp storage to avoid collisions across concurrent requests. | ||||||
| with tempfile.TemporaryDirectory(prefix="eduaid_subs_") as temp_dir: | ||||||
| subprocess.run( | ||||||
| [ | ||||||
| "yt-dlp", | ||||||
| "--write-auto-sub", | ||||||
| "--sub-lang", | ||||||
| "en", | ||||||
| "--skip-download", | ||||||
| "--sub-format", | ||||||
| "vtt", | ||||||
| "-o", | ||||||
| os.path.join(temp_dir, "%(id)s.%(ext)s"), | ||||||
| f"https://www.youtube.com/watch?v={video_id}", | ||||||
| ], | ||||||
| check=True, | ||||||
| capture_output=True, | ||||||
| text=True, | ||||||
| timeout=60, | ||||||
| ) | ||||||
|
|
||||||
| subtitle_files = glob.glob(os.path.join(temp_dir, "*.vtt")) | ||||||
| if not subtitle_files: | ||||||
| return jsonify({"error": "No subtitles found"}), 404 | ||||||
|
|
||||||
| latest_subtitle = max(subtitle_files, key=os.path.getctime) | ||||||
|
||||||
| latest_subtitle = max(subtitle_files, key=os.path.getctime) | |
| latest_subtitle = max(subtitle_files, key=os.path.getmtime) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove dead code: subtitles directory is no longer used.
The transcript endpoint now uses per-request temporary directories. This os.makedirs("subtitles", ...) line creates a directory that is never used, leaving stale code behind.
🧹 Suggested removal
if __name__ == "__main__":
- os.makedirs("subtitles", exist_ok=True)
app.run()🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.
In `@backend/server.py` at line 596, Remove the dead directory creation call
os.makedirs("subtitles", exist_ok=True) from server.py because the transcript
endpoint now uses per-request temporary directories; locate the invocation of
os.makedirs("subtitles", exist_ok=True) and delete that statement (and any
unused import of os if it becomes unused) so no stale "subtitles" directory is
created.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There should be a blank line between the top-level helper
is_valid_youtube_video_idand the@app.routedecorator below it to keep top-level definitions separated consistently (PEP8-style; most other endpoints in this file are separated by blank lines).