Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
381 changes: 3 additions & 378 deletions backend/Generator/main.py → backend/Generator/advanced_qa.py
Original file line number Diff line number Diff line change
@@ -1,387 +1,12 @@
import time
import torch
import random
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer,AutoModelForSeq2SeqLM, T5ForConditionalGeneration, T5Tokenizer
import numpy as np
import spacy
from sense2vec import Sense2Vec
from collections import OrderedDict
from nltk import FreqDist
from nltk.corpus import brown
from similarity.normalized_levenshtein import NormalizedLevenshtein
from Generator.mcq import tokenize_into_sentences, identify_keywords, find_sentences_with_keywords, generate_multiple_choice_questions, generate_normal_questions
from Generator.encoding import beam_search_decoding
from google.oauth2 import service_account
from googleapiclient.discovery import build
import en_core_web_sm
import json
import re
import numpy as np
from typing import Any, List, Mapping, Tuple
import re
import os
import fitz
import mammoth

class MCQGenerator:
    """Generates multiple-choice questions from free text using a T5 model
    plus spaCy/Sense2Vec keyword extraction."""

    def __init__(self):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
        self.model = T5ForConditionalGeneration.from_pretrained('Roasters/Question-Generator')
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.nlp = spacy.load('en_core_web_sm')
        self.s2v = Sense2Vec().from_disk('s2v_old')
        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.set_seed(42)

    def set_seed(self, seed):
        """Seed numpy and torch (CPU and, when present, all CUDA devices) for
        reproducible generation."""
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def generate_mcq(self, payload):
        """Generate multiple-choice questions for ``payload['input_text']``.

        Parameters:
            payload: mapping with ``input_text`` (str) and optional
                ``max_questions`` (int, default 4).

        Returns:
            dict with ``statement``, ``questions`` and ``time_taken`` keys, or
            an empty dict when no keywords are found or generation fails.
        """
        start_time = time.time()
        text = payload.get("input_text")
        max_questions = payload.get("max_questions", 4)

        sentences = tokenize_into_sentences(text)
        modified_text = " ".join(sentences)

        keywords = identify_keywords(self.nlp, modified_text, max_questions, self.s2v,
                                     self.fdist, self.normalized_levenshtein, len(sentences))
        keyword_sentence_mapping = find_sentences_with_keywords(keywords, sentences)

        # Keep at most the first three sentences per keyword as its context snippet.
        for k in keyword_sentence_mapping:
            keyword_sentence_mapping[k] = " ".join(keyword_sentence_mapping[k][:3])

        final_output = {}
        if not keyword_sentence_mapping:
            return final_output

        try:
            generated_questions = generate_multiple_choice_questions(
                keyword_sentence_mapping, self.device, self.tokenizer,
                self.model, self.s2v, self.normalized_levenshtein)
        except Exception:
            # Best-effort: a generation failure yields an empty result rather
            # than crashing the caller. (Was a bare ``except:``, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            return final_output

        final_output["statement"] = modified_text
        final_output["questions"] = generated_questions["questions"]
        final_output["time_taken"] = time.time() - start_time

        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        return final_output

class ShortQGenerator:
    """Produces short-answer ("normal") questions for a passage with a T5
    model, using spaCy/Sense2Vec keyword extraction to pick targets."""

    def __init__(self):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
        self.model = T5ForConditionalGeneration.from_pretrained('Roasters/Question-Generator')
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.nlp = spacy.load('en_core_web_sm')
        self.s2v = Sense2Vec().from_disk('s2v_old')
        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.set_seed(42)

    def set_seed(self, seed):
        """Seed numpy and torch so repeated runs decode identically."""
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def generate_shortq(self, payload):
        """Generate short-answer questions for ``payload['input_text']``.

        ``payload`` may also carry ``max_questions`` (default 4). Returns a
        dict with ``statement`` and ``questions``, or an empty dict when no
        keywords are identified.
        """
        source_text = payload.get("input_text")
        question_limit = payload.get("max_questions", 4)

        sentence_list = tokenize_into_sentences(source_text)
        joined_text = " ".join(sentence_list)

        found_keywords = identify_keywords(self.nlp, joined_text, question_limit,
                                           self.s2v, self.fdist,
                                           self.normalized_levenshtein,
                                           len(sentence_list))
        mapping = find_sentences_with_keywords(found_keywords, sentence_list)

        # Collapse each keyword's sentence list to a snippet of its first three.
        for keyword in mapping:
            mapping[keyword] = " ".join(mapping[keyword][:3])

        if not mapping:
            return {}

        generated = generate_normal_questions(mapping, self.device,
                                              self.tokenizer, self.model)

        final_output = {
            "statement": joined_text,
            "questions": generated["questions"],
        }

        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        return final_output

class ParaphraseGenerator:
    """Paraphrases a sentence via beam search over a T5 model."""

    def __init__(self):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-large')
        self.model = T5ForConditionalGeneration.from_pretrained('Roasters/Question-Generator')
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.set_seed(42)

    def set_seed(self, seed):
        """Seed numpy and torch (CPU and, when present, CUDA) for reproducibility."""
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def generate_paraphrase(self, payload):
        """Paraphrase ``payload['input_text']``.

        Parameters:
            payload: mapping with ``input_text`` (str) and optional
                ``max_questions`` (int, default 3) — the number of beam
                candidates to return.

        Returns:
            dict with 'Original Sentence', 'Count' and 'Paraphrased Questions'
            (duplicates and trivial copies of the input are filtered out).
        """
        sentence = payload.get("input_text")
        num = payload.get("max_questions", 3)

        text_to_paraphrase = "paraphrase: " + sentence + " </s>"

        # ``pad_to_max_length=True`` is deprecated in transformers;
        # ``padding="max_length"`` is the documented equivalent.
        encoding = self.tokenizer.encode_plus(text_to_paraphrase,
                                              padding="max_length",
                                              return_tensors="pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)

        beam_outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=50,
            num_beams=50,
            num_return_sequences=num,
            no_repeat_ngram_size=2,
            early_stopping=True
        )

        final_outputs = []
        for beam_output in beam_outputs:
            paraphrased_sentence = self.tokenizer.decode(
                beam_output, skip_special_tokens=True,
                clean_up_tokenization_spaces=True)
            # Skip case-insensitive copies of the input and exact duplicates.
            if paraphrased_sentence.lower() != sentence.lower() and paraphrased_sentence not in final_outputs:
                final_outputs.append(paraphrased_sentence)

        output = {
            'Original Sentence': sentence,
            'Count': num,
            'Paraphrased Questions': final_outputs,
        }

        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        return output

class BoolQGenerator:
    """Generates boolean (true/false) questions with a fine-tuned T5 model."""

    def __init__(self):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        self.model = T5ForConditionalGeneration.from_pretrained('Roasters/Boolean-Questions')
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.set_seed(42)

    def set_seed(self, seed):
        """Seed numpy and torch (CPU and, when present, CUDA) for reproducibility."""
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def random_choice(self):
        """Return a uniformly random boolean used as the target answer."""
        return bool(random.choice([0, 1]))

    def generate_boolq(self, payload):
        """Generate boolean questions for ``payload['input_text']``.

        Parameters:
            payload: mapping with ``input_text`` (str) and optional
                ``max_questions`` (int, default 4).

        Returns:
            dict with 'Text', 'Count' and 'Boolean_Questions' (the decoded
            beam-search outputs).
        """
        text = payload.get("input_text")
        num = payload.get("max_questions", 4)

        sentences = tokenize_into_sentences(text)
        modified_text = " ".join(sentences)
        answer = self.random_choice()
        # NOTE(review): the template interpolates the boolean into the
        # "passage:" slot, which looks swapped relative to the field names.
        # Kept byte-identical here — confirm against the model's training
        # prompt format before changing.
        form = "truefalse: %s passage: %s </s>" % (modified_text, answer)
        encoding = self.tokenizer.encode_plus(form, return_tensors="pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)

        output = beam_search_decoding(input_ids, attention_masks, self.model,
                                      self.tokenizer, num)
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        final = {}
        final['Text'] = text
        final['Count'] = num
        final['Boolean_Questions'] = output

        return final


class AnswerPredictor:
    """Answers questions over a passage: free-text answers via a T5 model and
    boolean answers via a lightweight MNLI entailment model."""

    def __init__(self):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-large', model_max_length=512)
        self.model = T5ForConditionalGeneration.from_pretrained('Roasters/Answer-Predictor')
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Lightweight NLI model for boolean question answering (runs on CPU;
        # it is never moved to ``self.device``).
        self.nli_model_name = "typeform/distilbert-base-uncased-mnli"
        self.nli_tokenizer = AutoTokenizer.from_pretrained(self.nli_model_name)
        self.nli_model = AutoModelForSequenceClassification.from_pretrained(self.nli_model_name)

        self.set_seed(42)

    def set_seed(self, seed):
        """Seed numpy and torch (CPU and, when present, CUDA) for reproducible decoding."""
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def greedy_decoding(self, inp_ids, attn_mask):
        """Greedy-decode one answer and return it stripped and capitalized."""
        greedy_output = self.model.generate(input_ids=inp_ids,
                                            attention_mask=attn_mask,
                                            max_length=256)
        answer = self.tokenizer.decode(greedy_output[0], skip_special_tokens=True,
                                       clean_up_tokenization_spaces=True)
        return answer.strip().capitalize()

    def predict_answer(self, payload):
        """Answer each question in ``payload['input_question']`` against
        ``payload['input_text']``.

        Returns a list of answer strings, one per question, in input order.
        """
        context = payload.get("input_text")
        answers = []
        for question in payload.get("input_question"):
            input_text = "question: %s <s> context: %s </s>" % (question, context)
            encoding = self.tokenizer.encode_plus(input_text, return_tensors="pt")
            input_ids = encoding["input_ids"].to(self.device)
            attention_masks = encoding["attention_mask"].to(self.device)
            # Reuse the shared decoding helper instead of duplicating its body
            # (the original inlined an identical generate/decode sequence here).
            answers.append(self.greedy_decoding(input_ids, attention_masks))

        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        return answers

    def predict_boolean_answer(self, payload):
        """Answer yes/no questions as booleans via NLI: True when the passage
        entails the question-as-hypothesis more strongly than it contradicts it.

        Returns a list of bools, one per question in ``payload['input_question']``.
        """
        input_text = payload.get("input_text", "")
        input_questions = payload.get("input_question", [])

        answers = []
        for question in input_questions:
            inputs = self.nli_tokenizer.encode_plus(input_text, question,
                                                    return_tensors="pt")
            with torch.no_grad():  # inference only; skip autograd bookkeeping
                logits = self.nli_model(**inputs).logits
            probabilities = torch.softmax(logits, dim=1)
            # assumes label order entailment/neutral/contradiction = 0/1/2 for
            # this checkpoint — TODO confirm against the model's id2label map.
            entailment_prob = probabilities[0][0].item()
            contradiction_prob = probabilities[0][2].item()
            answers.append(entailment_prob > contradiction_prob)

        if self.device.type == 'cuda':
            torch.cuda.empty_cache()

        return answers

class GoogleDocsService:
    """Reads Google Docs document text via the Docs v1 API using a service
    account credential file."""

    def __init__(self, service_account_file, scopes):
        self.credentials = service_account.Credentials.from_service_account_file(
            service_account_file, scopes=scopes)
        self.docs_service = build('docs', 'v1', credentials=self.credentials)

    @staticmethod
    def extract_document_id(url):
        """Extract the Google Docs document ID from ``url``.

        Returns the ID string, or None when the URL has no ``/document/d/<id>``
        segment.
        """
        match = re.search(r'/document/d/([^/]+)', url)
        if match:
            return match.group(1)
        return None

    # NOTE: two stray imports (transformers, en_core_web_sm) sat here inside
    # the class body; they executed at class-definition time and duplicated
    # the module-level imports, so they were removed.

    def get_document_content(self, document_url):
        """Return the plain text of the document at ``document_url``.

        Raises:
            ValueError: when the URL does not contain a document ID.
        """
        document_id = self.extract_document_id(document_url)
        if not document_id:
            raise ValueError('Invalid document URL')

        response = self.docs_service.documents().get(documentId=document_id).execute()
        doc = response.get('body', {})

        # Concatenate every textRun of every paragraph in document order;
        # join once instead of quadratic string +=.
        parts = []
        for element in doc.get('content', []):
            if 'paragraph' in element:
                for p in element['paragraph']['elements']:
                    if 'textRun' in p:
                        parts.append(p['textRun']['content'])

        return ''.join(parts).strip()


class FileProcessor:
    """Saves an uploaded file to a scratch folder, extracts its text
    (txt/pdf/docx), then deletes the copy."""

    def __init__(self, upload_folder='uploads/'):
        self.upload_folder = upload_folder
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists / os.makedirs pair.
        os.makedirs(self.upload_folder, exist_ok=True)

    def extract_text_from_pdf(self, file_path):
        """Return the concatenated text of every page of a PDF."""
        # Context manager closes the document; the original leaked the handle.
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)

    def extract_text_from_docx(self, file_path):
        """Return the raw text of a .docx file via mammoth."""
        with open(file_path, "rb") as docx_file:
            return mammoth.extract_raw_text(docx_file).value

    def process_file(self, file):
        """Persist ``file`` (an upload object with ``filename`` and ``save``),
        extract its text, and remove the saved copy.

        Supports .txt, .pdf and .docx extensions (case-insensitive); any other
        extension yields an empty string. The temporary copy is removed even
        when extraction raises.
        """
        file_path = os.path.join(self.upload_folder, file.filename)
        file.save(file_path)
        content = ""
        try:
            name = file.filename.lower()
            if name.endswith('.txt'):
                # Explicit encoding; the original relied on the locale default.
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            elif name.endswith('.pdf'):
                content = self.extract_text_from_pdf(file_path)
            elif name.endswith('.docx'):
                content = self.extract_text_from_docx(file_path)
        finally:
            # Always clean up, even on extraction errors (original left the
            # file behind when extraction raised).
            os.remove(file_path)
        return content

class QuestionGenerator:
"""A transformer-based NLP system for generating reading comprehension-style questions from
Expand Down
Loading
Loading