Commit 0615194c7f by ahmed531998 (parent 6456dca7d2), 2023-04-21 04:08:30 +02:00

4 changed files with 62 additions and 60 deletions

Dockerfile

@@ -2,12 +2,12 @@ FROM python:3.8.10-slim
 WORKDIR /backend_janet
-COPY requirements_simple.txt .
+COPY requirements_main.txt .
-RUN pip install -r requirements_simple.txt
+RUN pip install -r requirements_main.txt
 RUN rm -fr /root/.cache/*
 COPY . .
-ENTRYPOINT ["python", "main_simple.py"]
+ENTRYPOINT ["python", "main.py"]


@@ -5,6 +5,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import pandas as pd
 from datetime import datetime
+from datasets import Dataset

 class ResponseGenerator:
     def __init__(self, index, db, recommender, generators, retriever, num_retrieved=3):
@@ -81,11 +82,17 @@ class ResponseGenerator:
         return []

     def _search_index(self, index_type, db_type, query, multi=False):
-        xq = self.retriever.encode([query])
-        D, I = self.index[index_type].search(xq, self.num_retrieved)
+        self.index[index_type].add_faiss_index(column="embeddings")
+        scores, samples = self.index[index_type].get_nearest_examples(
+            "embeddings", self.retriever.encode([query]), k=self.num_retrieved
+        )
+        samples_df = pd.DataFrame.from_dict(samples)
+        samples_df["scores"] = scores
+        samples_df.sort_values("scores", ascending=False, inplace=True)
         if multi:
-            return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True)
+            return samples_df.reset_index(drop=True)
-        return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True).loc[0]
+        return samples_df.reset_index(drop=True).loc[0]

     def gen_response(self, action, utterance=None, name=None, username=None, vrename=None, state=None, consec_history=None, chitchat_history=None):
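For reference, the replacement follows the Hugging Face datasets FAISS workflow: attach an in-memory FAISS index to the precomputed embeddings column, query it with the encoded utterance, and rank the returned examples by score. A minimal, self-contained sketch of that flow, assuming the repo's retriever is a sentence-transformers encoder (model name and toy corpus are illustrative):

import pandas as pd
from datasets import Dataset
from sentence_transformers import SentenceTransformer  # assumption: `retriever` is this kind of encoder

retriever = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model choice

# A toy corpus shaped like what the reworked create_index() in VRE.py produces: one embedding per row.
corpus = pd.DataFrame({"title": ["Ocean salinity dataset", "Coral reef survey", "Fishery statistics"]})
ds = Dataset.from_pandas(corpus).map(
    lambda x: {"embeddings": retriever.encode([x["title"]])[0]}
)

# Attach a FAISS index to the embeddings column, then fetch the k nearest examples.
ds.add_faiss_index(column="embeddings")
scores, samples = ds.get_nearest_examples("embeddings", retriever.encode(["reef survey"]), k=2)

results = pd.DataFrame.from_dict(samples)
results["scores"] = scores
print(results[["title", "scores"]])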

VRE.py

@@ -14,7 +14,7 @@ import urllib.request
 import time
 import threading
-import html2text
+from datasets import Dataset

 class VRE:
     def __init__(self, name, token, retriever, directory='/app/'):
@@ -35,12 +35,12 @@ class VRE:
                    'dataset_db': pd.read_json(self.directory + self.name + '_dataset.json') if os.path.isfile(self.directory + self.name + '_dataset.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created', 'url']),
                    'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content']),
                    'post_db': pd.read_json(self.directory + self.name + '_post.json') if os.path.isfile(self.directory + self.name + '_post.json') else pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])}
-        self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else faiss.read_index(self.directory + 'janet_dataset_titles_index'),
-                      'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else faiss.read_index(self.directory + 'janet_paper_titles_index'),
-                      'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else faiss.read_index(self.directory + 'janet_dataset_desc_index'),
-                      'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else faiss.read_index(self.directory + 'janet_paper_desc_index'),
-                      'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index'),
-                      'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else faiss.read_index(self.directory + 'janet_post_index')}
+        self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else Dataset.load_from_disk(self.directory + 'janet_dataset_titles_index'),
+                      'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else Dataset.load_from_disk(self.directory + 'janet_paper_titles_index'),
+                      'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else Dataset.load_from_disk(self.directory + 'janet_dataset_desc_index'),
+                      'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else Dataset.load_from_disk(self.directory + 'janet_paper_desc_index'),
+                      'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else Dataset.load_from_disk(self.directory + 'janet_content_index'),
+                      'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else Dataset.load_from_disk(self.directory + 'janet_post_index')}
         self.new_income = False
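One caveat on the reload guards above: Dataset.save_to_disk writes a directory (Arrow shards plus metadata), not a single file, so an os.path.isfile check stays False even after a successful save and the indexes would be rebuilt on every start. Testing for a directory looks like the safer guard; a minimal sketch (the helper name is hypothetical):

import os
from datasets import Dataset

def load_saved_index(path):
    # Dataset.save_to_disk(path) creates a directory of Arrow files,
    # so check for a directory rather than a file before reloading.
    return Dataset.load_from_disk(path) if os.path.isdir(path) else None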

     def init(self):

@@ -48,69 +48,59 @@ class VRE:
         self.get_content()
         if self.index['dataset_titles_index'] is None:
             self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
-            self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
+            #self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
         if self.index['dataset_desc_index'] is None:
             self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
-            self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
+            #self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
         if self.index['paper_titles_index'] is None:
             self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
-            self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
+            #self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
         if self.index['paper_desc_index'] is None:
             self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
-            self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
+            #self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
         if self.index['content_index'] is None:
             self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
-            self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
+            #self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
         if self.index['post_index'] is None:
             self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
-            self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
+            #self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')

     def index_periodic_update(self):
         if self.new_income:
             if len(self.db['content_db'])%100 != 0:
                 self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
-                self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
+                #self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
             if len(self.db['post_db'])%100 != 0:
                 self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
-                self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
+                #self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
             if len(self.db['paper_db'])%100 != 0:
                 self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
                 self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
-                self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
-                self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
+                #self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
+                #self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
             if len(self.db['dataset_db'])%100 != 0:
                 self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
                 self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
-                self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
-                self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
+                #self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
+                #self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
             self.new_income = False

     def create_index(self, db_type, attribute, index_type, filename):
         filename = self.directory + filename
-        to_index = self.db[db_type][attribute]
-        for i, info in enumerate(to_index):
-            if i == 0:
-                emb = self.retriever.encode([info])
-                sentence_embeddings = np.array(emb)
-            else:
-                emb = self.retriever.encode([info])
-                sentence_embeddings = np.append(sentence_embeddings, emb, axis=0)
-        # number of partitions of the coarse quantizer = number of posting lists
-        # as a rule of thumb, 4*sqrt(N) < nlist < 16*sqrt(N), where N is the size of the database
-        nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1
-        code_size = 8  # number of subquantizers = number of sub-vectors
-        n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings)))  # n_bits of each code (8 -> 1-byte codes)
-        d = sentence_embeddings.shape[1]
-        coarse_quantizer = faiss.IndexFlatL2(d)  # keeps the centroids of the coarse quantizer (for the inverted lists)
-        self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)
-        self.index[index_type].train(sentence_embeddings)  # train on a random subset to speed up k-means (NOTE: ensure it is randomly chosen!)
-        faiss.write_index(self.index[index_type], filename)
+        dataset = Dataset.from_pandas(self.db[db_type])
+        embeddings_dataset = dataset.map(
+            lambda x: {"embeddings": self.retriever.encode([x[attribute]])[0]}
+        )
+        embeddings_dataset.save_to_disk(filename)
+        self.index[index_type] = embeddings_dataset
+        #faiss.write_index(self.index[index_type], filename)

     def populate_index(self, db_type, attribute, index_type, filename):
         filename = self.directory + filename
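The rewritten create_index swaps the hand-rolled FAISS IVFPQ pipeline (manual embedding loop, coarse quantizer, train, write_index) for a datasets-native one: the DataFrame becomes a Dataset, map adds an embeddings column, and save_to_disk persists it. A condensed sketch of the build-and-reload round trip, using a batched map as a likely speed-up over the per-row encoding above (same encoder assumption as earlier; the path and data are illustrative):

import pandas as pd
from datasets import Dataset, load_from_disk
from sentence_transformers import SentenceTransformer  # assumed encoder type for `retriever`

retriever = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model choice

# Build side: embed one column, persist the whole dataset to disk.
db = pd.DataFrame({"title": ["Ocean salinity dataset", "Coral reef survey", "Fishery statistics"]})
embeddings_dataset = Dataset.from_pandas(db).map(
    lambda batch: {"embeddings": retriever.encode(batch["title"])},
    batched=True,  # encode many rows per call instead of one at a time
)
embeddings_dataset.save_to_disk("janet_demo_index")  # note: this writes a directory

# Reload side: restore the dataset, then reattach the FAISS index before searching.
restored = load_from_disk("janet_demo_index")
restored.add_faiss_index(column="embeddings")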
@@ -185,15 +175,15 @@ class VRE:
                 self.dataset_counter += 1
                 dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date, url]
-        self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True).reset_index(drop=True)
-        self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True).reset_index(drop=True)
-        self.db['post_db'] = post_df.sort_values(by='time', ascending=True).reset_index(drop=True)
+        self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True)
+        self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True)
+        self.db['post_db'] = post_df.sort_values(by='time', ascending=True)
-        other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
+        #other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
         for i, post in post_df.iterrows():
-            if post['author'] != "Catalogue":
+            if post['author'] != "catalogue":
                 self.content_counter+=1
-                other_content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
+                content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
         """
         for i, description in dataset_df.iterrows():
             self.content_counter+=1
@@ -202,7 +192,7 @@ class VRE:
             self.content_counter+=1
             other_content_df.loc[str(self.content_counter)] = [self.content_counter, -3, description['title'] + ' is a paper. ' + description['notes'] + ' It is about ' + ', '.join(description['tags'])]
         """
-        self.db['content_db'] = pd.concat([content_df, other_content_df]).reset_index(drop=True)
+        self.db['content_db'] = content_df
         self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
         self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
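A side effect worth noting: dropping reset_index(drop=True) matters because rows are appended with df.loc[str(counter)], i.e. under string labels derived from the running counters. Resetting to a plain integer index discards those labels, so a later loc write with a stringified counter would pile a string label on top of the integer index instead of lining up with it. A small pandas sketch of the difference (toy frame, hypothetical counter values):

import pandas as pd

df = pd.DataFrame(columns=['id', 'content'])
df.loc['1'] = [1, 'first post']   # rows keyed by the stringified counter
df.loc['2'] = [2, 'second post']

print(df.index.tolist())                         # ['1', '2'] - labels match the counters
print(df.reset_index(drop=True).index.tolist())  # [0, 1]    - counter labels are gone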
@@ -282,17 +272,17 @@ class VRE:
                 self.dataset_counter += 1
                 dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date, url]
-        self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)]).reset_index(drop=True)
-        self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)]).reset_index(drop=True)
+        self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])
+        self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])
-        self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)]).reset_index(drop=True)
+        self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)])
         self.db['post_db'].to_json(self.directory+self.name+'_post.json')
         other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
         for i, post in post_df.iterrows():
-            if post['author'] != "Catalogue":
+            if post['author'] != "catalogue":
                 self.content_counter+=1
-                other_content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
+                content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
         """
         for i, description in dataset_df.iterrows():
             self.content_counter+=1
@@ -304,7 +294,7 @@ class VRE:
         self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
         self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
-        self.db['content_db'] = pd.concat([self.db['content_db'], content_df, other_content_df]).reset_index(drop=True)
+        self.db['content_db'] = pd.concat([self.db['content_db'], content_df])
         self.db['content_db'].to_json(self.directory + self.name + '_content.json')
         if not paper_df.empty or not dataset_df.empty or not content_df.empty or not post_df.empty:
             self.new_income = True

main.py

@@ -46,11 +46,16 @@ def vre_fetch():
         rg.update_index(vre.get_index())
         rg.update_db(vre.get_db())

 def user_interest_decay(token):
     while True:
-        print("decaying interests after 3 minutes for " + users[token]['username'])
-        time.sleep(180)
-        users[token]['user'].decay_interests()
+        if token in users:
+            print("decaying interests after 3 minutes for " + users[token]['username'])
+            time.sleep(180)
+            users[token]['user'].decay_interests()
+        else:
+            break

 def clear_inactive():
     while True:
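The guard makes each per-user decay thread exit cleanly once its token disappears from users, instead of raising a KeyError on the lookup. Note that the membership test runs before the 180-second sleep, so a token removed while the thread sleeps can still trigger one failing access; re-checking after waking closes that window. A minimal sketch of the pattern (the users structure and decay step are simplified stand-ins):

import threading
import time

users = {"tok1": {"username": "alice", "interest": 1.0}}

def user_interest_decay(token, interval=3):  # short interval for the demo
    while token in users:            # exit once the user has been cleared
        time.sleep(interval)
        if token not in users:       # re-check after sleeping: the user may
            break                    # have been removed during the wait
        users[token]["interest"] *= 0.5

threading.Thread(target=user_interest_decay, args=("tok1",), daemon=True).start()
time.sleep(7)
del users["tok1"]                    # the thread notices on its next wake-up and stops
time.sleep(4)                        # give the worker one more wake-up to observe the removal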