""" from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline import torch class NLU: def_tokenizer = AutoTokenizer.from_pretrained("castorini/t5-base-canard") def_model = AutoModelForSeq2SeqLM.from_pretrained("castorini/t5-base-canard") def_intent_classifier = pipeline("sentiment-analysis", model="/home/ahmed/PycharmProjects/Janet/JanetBackend/intent_classifier") def __init__(self, model=def_model, tokenizer=def_tokenizer, intent_classifier=def_intent_classifier, max_history_length=1024, num_gen_seq=2, score_threshold=0.5): self.input = "" self.output = "" self.model = model self.tokenizer = tokenizer self.max_length = max_history_length self.num_return_sequences = num_gen_seq self.score_threshold = score_threshold self.label2id = {'Greet': 0, 'Bye': 1, 'GetKnowledge': 2, 'ChitChat': 3} self.id2label = {0: 'Greet', 1: 'Bye', 2: 'GetKnowledge', 3: 'ChitChat'} self.intent_classifier = intent_classifier def process_utterance(self, utterance, history): if len(history) > 0: # crop history while len(history.split(" ")) > self.max_length: index = history.find("|||") history = history[index + 4:] self.input = history + " ||| " + utterance inputs = self.tokenizer(self.input, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt") candidates = self.model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], return_dict_in_generate=True, output_scores=True, num_return_sequences=self.num_return_sequences, num_beams=self.num_return_sequences) for i in range(candidates["sequences"].shape[0]): generated_sentence = self.tokenizer.decode(candidates["sequences"][i], skip_special_tokens=True, clean_up_tokenization_spaces=True) log_scores = candidates['sequences_scores'] norm_prob = (torch.exp(log_scores[i]) / torch.exp(log_scores).sum()).item() if norm_prob >= self.score_threshold: self.score_threshold = norm_prob self.output = generated_sentence else: self.output = utterance intent = self.label2id[self.intent_classifier(self.output)[0]['label']] intent_conf = self.intent_classifier(self.output)[0]['score'] return {"modified_prompt": self.output, "mod_confidence": self.score_threshold, "prompt_intent": intent, "intent_confidence": intent_conf} """ import threading import spacy import spacy_transformers import torch from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline class NLU: def __init__(self, device, device_flag, reference_resolver, tokenizer, intent_classifier, offense_filter, entity_extractor, max_history_length=1024): #entity_extractor=def_entity_extractor self.reference_resolver = reference_resolver self.device = device self.reference_resolver.to(device) self.tokenizer = tokenizer self.max_length = max_history_length self.label2idintent = {'QA': 0, 'CHITCHAT': 1, 'FINDPAPER': 2, 'FINDDATASET': 3, 'SUMMARIZEPAPER': 4} self.id2labelintent = {0: 'QA', 1: 'CHITCHAT', 2: 'FINDPAPER', 3: 'FINDDATASET', 4: 'SUMMARIZEPAPER'} self.label2idoffense = {'hate': 0, 'offensive': 1, 'neither': 2} self.id2labeloffense = {0: 'hate', 1: 'offensive', 2: 'neither'} self.intent_classifier = pipeline("sentiment-analysis", model=intent_classifier, device=device_flag) self.entity_extractor = entity_extractor self.offense_filter = pipeline("sentiment-analysis", model=offense_filter, device=device_flag) self.intents = None self.entities = None self.offensive = None self.clear = True def _intentpredictor(self): self.intents = self.label2idintent[self.intent_classifier(self.to_process)[0]['label']] def _entityextractor(self): self.entities = [] doc = self.entity_extractor(self.to_process) for entity in doc.ents: if entity.text not in ['.', ',', '?', ';']: self.entities.append({'entity': entity.label_, 'value': entity.text}) def _inappropriatedetector(self): self.offensive = False is_offensive = self.label2idoffense[self.offense_filter(self.to_process)[0]['label']] if is_offensive == 0 or is_offensive == 1: self.offensive = True def process_utterance(self, utterance, history): """ Given an utterance and the history of the conversation, refine the query contextually and return a refined utterance """ self.to_process = utterance if len(history) > 0: # crop history while len(history.split(" ")) > self.max_length: index = history.find("|||") history = history[index + 4:] context = history + " ||| " + utterance inputs = self.tokenizer(context, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt") candidates = self.reference_resolver.generate(input_ids=inputs["input_ids"].to(self.device), attention_mask=inputs["attention_mask"].to(self.device), return_dict_in_generate=True, output_scores=True, num_return_sequences=1, num_beams=5) self.to_process = self.tokenizer.decode(candidates["sequences"][0], skip_special_tokens=True, clean_up_tokenization_spaces=True) t1 = threading.Thread(target=self._intentpredictor, name='intent') t2 = threading.Thread(target=self._entityextractor, name='entity') t3 = threading.Thread(target=self._inappropriatedetector, name='offensive') t3.start() t1.start() t2.start() t3.join() t1.join() t2.join() return {"modified_prompt": self.to_process, "intent": self.intents, "entities": self.entities, "is_offensive": self.offensive, "is_clear": self.clear}