Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 52 additions & 16 deletions backend/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import subprocess
import os
import glob
import tempfile

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
Expand Down Expand Up @@ -533,28 +534,63 @@ def clean_transcript(file_path):

return " ".join(transcript_lines).strip()


# A YouTube video ID is matched as exactly 11 characters drawn from ASCII
# letters, digits, underscore and hyphen.
YOUTUBE_VIDEO_ID_PATTERN = re.compile(r"^[A-Za-z0-9_-]{11}$")


def is_valid_youtube_video_id(video_id):
    """Return True if *video_id* has the shape of a YouTube video ID.

    This is purely a format check against YOUTUBE_VIDEO_ID_PATTERN; it does
    not contact YouTube or confirm that the video exists.

    Args:
        video_id: Candidate ID. Any non-string value (including None) is
            rejected rather than raising.

    Returns:
        bool: True when the whole string matches the pattern, else False.
    """
    # Guard first: passing a truthy non-string (int, bytes, list, ...) to
    # fullmatch() raises TypeError, which would surface as a server error
    # instead of a clean "invalid ID" response.
    if not isinstance(video_id, str):
        return False
    return YOUTUBE_VIDEO_ID_PATTERN.fullmatch(video_id) is not None

Copy link

Copilot AI Mar 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be a blank line between the top-level helper is_valid_youtube_video_id and the @app.route decorator below it to keep top-level definitions separated consistently (PEP8-style; most other endpoints in this file are separated by blank lines).

Suggested change

Copilot uses AI. Check for mistakes.
@app.route('/getTranscript', methods=['GET'])
def get_transcript():
video_id = request.args.get('videoId')
video_id = (request.args.get('videoId') or '').strip()
if not video_id:
return jsonify({"error": "No video ID provided"}), 400

subprocess.run(["yt-dlp", "--write-auto-sub", "--sub-lang", "en", "--skip-download",
"--sub-format", "vtt", "-o", f"subtitles/{video_id}.vtt", f"https://www.youtube.com/watch?v={video_id}"],
check=True, capture_output=True, text=True)

# Find the latest .vtt file in the "subtitles" folder
subtitle_files = glob.glob("subtitles/*.vtt")
if not subtitle_files:
return jsonify({"error": "No subtitles found"}), 404
if not is_valid_youtube_video_id(video_id):
return jsonify({"error": "Invalid YouTube video ID format"}), 400

latest_subtitle = max(subtitle_files, key=os.path.getctime)
transcript_text = clean_transcript(latest_subtitle)

# Optional: Clean up the file after reading
os.remove(latest_subtitle)

return jsonify({"transcript": transcript_text})
try:
# Use per-request temp storage to avoid collisions across concurrent requests.
with tempfile.TemporaryDirectory(prefix="eduaid_subs_") as temp_dir:
subprocess.run(
[
"yt-dlp",
"--write-auto-sub",
"--sub-lang",
"en",
"--skip-download",
"--sub-format",
"vtt",
"-o",
os.path.join(temp_dir, "%(id)s.%(ext)s"),
f"https://www.youtube.com/watch?v={video_id}",
],
check=True,
capture_output=True,
text=True,
timeout=60,
)

subtitle_files = glob.glob(os.path.join(temp_dir, "*.vtt"))
if not subtitle_files:
return jsonify({"error": "No subtitles found"}), 404

latest_subtitle = max(subtitle_files, key=os.path.getctime)
Copy link

Copilot AI Mar 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

os.path.getctime is platform-dependent (on Unix it’s metadata-change time, not creation time). For selecting the newest subtitle file, prefer a deterministic strategy (e.g., getmtime, or selecting the expected filename for the requested video_id) to avoid surprising picks when multiple .vtt files exist.

Suggested change
- latest_subtitle = max(subtitle_files, key=os.path.getctime)
+ latest_subtitle = max(subtitle_files, key=os.path.getmtime)

Copilot uses AI. Check for mistakes.
transcript_text = clean_transcript(latest_subtitle)
if not transcript_text:
return jsonify({"error": "Transcript is empty"}), 404

return jsonify({"transcript": transcript_text})
except subprocess.TimeoutExpired as err:
app.logger.exception("yt-dlp timeout in /getTranscript: %s", err)
return jsonify({"error": "Transcript extraction timed out"}), 504
except subprocess.CalledProcessError as err:
app.logger.exception("yt-dlp failed in /getTranscript: %s", err)
return jsonify({"error": "Failed to fetch transcript"}), 502
except Exception as err:
app.logger.exception("Unhandled exception in /getTranscript: %s", err)
return jsonify({"error": "Internal server error"}), 500

if __name__ == "__main__":
os.makedirs("subtitles", exist_ok=True)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Remove dead code: subtitles directory is no longer used.

The transcript endpoint now uses per-request temporary directories. This os.makedirs("subtitles", ...) line creates a directory that is never used, leaving stale code behind.

🧹 Suggested removal
 if __name__ == "__main__":
-    os.makedirs("subtitles", exist_ok=True)
     app.run()
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@backend/server.py` at line 596, remove the dead directory creation call
os.makedirs("subtitles", exist_ok=True): the transcript endpoint now uses
per-request temporary directories, so the "subtitles" directory is never read
or written. Locate that invocation and delete only that statement; keep the
`os` import, because the transcript endpoint still relies on it for its
os.path calls.

Expand Down
Loading