trial
This commit is contained in:
parent
4a28e90975
commit
019281a217
|
@ -2,12 +2,12 @@ FROM python:3.8.10-slim
|
|||
|
||||
WORKDIR /backend_janet
|
||||
|
||||
COPY requirements_simple.txt .
|
||||
COPY requirements_main.txt .
|
||||
|
||||
RUN pip install -r requirements_simple.txt
|
||||
RUN pip install -r requirements_main.txt
|
||||
|
||||
RUN rm -fr /root/.cache/*
|
||||
|
||||
COPY . .
|
||||
|
||||
ENTRYPOINT ["python", "main_simple.py"]
|
||||
ENTRYPOINT ["python", "main.py"]
|
||||
|
|
|
@ -5,6 +5,7 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from datasets import Dataset
|
||||
|
||||
class ResponseGenerator:
|
||||
def __init__(self, index, db,recommender,generators, retriever, num_retrieved=3):
|
||||
|
@ -81,11 +82,17 @@ class ResponseGenerator:
|
|||
return []
|
||||
|
||||
def _search_index(self, index_type, db_type, query, multi=False):
|
||||
xq = self.retriever.encode([query])
|
||||
D, I = self.index[index_type].search(xq, self.num_retrieved)
|
||||
self.index[index_type].add_faiss_index(column="embeddings")
|
||||
scores, samples = self.index[index_type].get_nearest_examples(
|
||||
"embeddings", retriever.encode([query]), k=3
|
||||
)
|
||||
samples_df = pd.DataFrame.from_dict(samples)
|
||||
samples_df["scores"] = scores
|
||||
samples_df.sort_values("scores", ascending=False, inplace=True)
|
||||
|
||||
if multi:
|
||||
return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True)
|
||||
return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True).loc[0]
|
||||
return samples_df.reset_index(drop=True)
|
||||
return samples_df.iloc[0].reset_index(drop=True)
|
||||
|
||||
|
||||
def gen_response(self, action, utterance=None, name=None, username=None, vrename=None, state=None, consec_history=None, chitchat_history=None):
|
||||
|
|
90
VRE.py
90
VRE.py
|
@ -14,7 +14,7 @@ import urllib.request
|
|||
import time
|
||||
import threading
|
||||
import html2text
|
||||
|
||||
from datasets import Dataset
|
||||
|
||||
class VRE:
|
||||
def __init__(self, name, token, retriever, directory='/app/'):
|
||||
|
@ -35,12 +35,12 @@ class VRE:
|
|||
'dataset_db': pd.read_json(self.directory + self.name + '_dataset.json') if os.path.isfile(self.directory + self.name + '_dataset.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created', 'url']),
|
||||
'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content']),
|
||||
'post_db': pd.read_json(self.directory + self.name + '_post.json') if os.path.isfile(self.directory + self.name + '_post.json') else pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])}
|
||||
self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else faiss.read_index(self.directory + 'janet_dataset_titles_index'),
|
||||
'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else faiss.read_index(self.directory + 'janet_paper_titles_index'),
|
||||
'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else faiss.read_index(self.directory + 'janet_dataset_desc_index'),
|
||||
'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else faiss.read_index(self.directory + 'janet_paper_desc_index'),
|
||||
'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index'),
|
||||
'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else faiss.read_index(self.directory + 'janet_post_index')}
|
||||
self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else Dataset.load_from_disk(self.directory + 'janet_dataset_titles_index'),
|
||||
'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else Dataset.load_from_disk(self.directory + 'janet_paper_titles_index'),
|
||||
'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else Dataset.load_from_disk(self.directory + 'janet_dataset_desc_index'),
|
||||
'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else Dataset.load_from_disk(self.directory + 'janet_paper_desc_index'),
|
||||
'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else Dataset.load_from_disk(self.directory + 'janet_content_index'),
|
||||
'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else Dataset.load_from_disk(self.directory + 'janet_post_index')}
|
||||
self.new_income = False
|
||||
|
||||
def init(self):
|
||||
|
@ -48,69 +48,59 @@ class VRE:
|
|||
self.get_content()
|
||||
if self.index['dataset_titles_index'] is None:
|
||||
self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
|
||||
self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
|
||||
#self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
|
||||
|
||||
if self.index['dataset_desc_index'] is None:
|
||||
self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
|
||||
self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
|
||||
#self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
|
||||
|
||||
if self.index['paper_titles_index'] is None:
|
||||
self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
|
||||
self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
|
||||
#self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
|
||||
|
||||
if self.index['paper_desc_index'] is None:
|
||||
self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
|
||||
self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
|
||||
#self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
|
||||
|
||||
if self.index['content_index'] is None:
|
||||
self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
|
||||
self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
|
||||
#self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
|
||||
|
||||
if self.index['post_index'] is None:
|
||||
self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
|
||||
self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
|
||||
#self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
|
||||
|
||||
def index_periodic_update(self):
|
||||
if self.new_income:
|
||||
if len(self.db['content_db'])%100 != 0:
|
||||
self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
|
||||
self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
|
||||
#self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
|
||||
if len(self.db['post_db'])%100 != 0:
|
||||
self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
|
||||
self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
|
||||
#self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
|
||||
if len(self.db['paper_db'])%100 != 0:
|
||||
self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
|
||||
self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
|
||||
self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
|
||||
self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
|
||||
#self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
|
||||
#self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
|
||||
if len(self.db['dataset_db'])%100 != 0:
|
||||
self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
|
||||
self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
|
||||
self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
|
||||
self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
|
||||
#self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
|
||||
#self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
|
||||
self.new_income = False
|
||||
|
||||
def create_index(self, db_type, attribute, index_type, filename):
|
||||
filename = self.directory + filename
|
||||
to_index = self.db[db_type][attribute]
|
||||
for i, info in enumerate(to_index):
|
||||
if i == 0:
|
||||
emb = self.retriever.encode([info])
|
||||
sentence_embeddings = np.array(emb)
|
||||
else:
|
||||
emb = self.retriever.encode([info])
|
||||
sentence_embeddings = np.append(sentence_embeddings, emb, axis=0)
|
||||
|
||||
# number of partitions of the coarse quantizer = number of posting lists
|
||||
# as rule of thumb, 4*sqrt(N) < nlist < 16*sqrt(N), where N is the size of the database
|
||||
nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1
|
||||
code_size = 8 # = number of subquantizers = number of sub-vectors
|
||||
n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings))) # n_bits of each code (8 -> 1 byte codes)
|
||||
d = sentence_embeddings.shape[1]
|
||||
coarse_quantizer = faiss.IndexFlatL2(d) # will keep centroids of coarse quantizer (for inverted list)
|
||||
self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)
|
||||
self.index[index_type].train(sentence_embeddings) # train on a random subset to speed up k-means (NOTE: ensure they are randomly chosen!)
|
||||
faiss.write_index(self.index[index_type], filename)
|
||||
dataset = Dataset.from_pandas(self.db[db_type])
|
||||
embeddings_dataset = dataset.map(
|
||||
lambda x: {"embeddings": self.retriever.encode([x[attribute]])[0]}
|
||||
)
|
||||
embeddings_dataset.save_to_disk(filename)
|
||||
self.index[index_type] = embeddings_dataset
|
||||
#faiss.write_index(self.index[index_type], filename)
|
||||
|
||||
def populate_index(self, db_type, attribute, index_type, filename):
|
||||
filename = self.directory + filename
|
||||
|
@ -185,15 +175,15 @@ class VRE:
|
|||
self.dataset_counter += 1
|
||||
dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date, url]
|
||||
|
||||
self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True).reset_index(drop=True)
|
||||
self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True).reset_index(drop=True)
|
||||
self.db['post_db'] = post_df.sort_values(by='time', ascending=True).reset_index(drop=True)
|
||||
self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True)
|
||||
self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True)
|
||||
self.db['post_db'] = post_df.sort_values(by='time', ascending=True)
|
||||
|
||||
other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
|
||||
#other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
|
||||
for i, post in post_df.iterrows():
|
||||
if post['author'] != "Catalogue":
|
||||
if post['author'] != "catalogue":
|
||||
self.content_counter+=1
|
||||
other_content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
|
||||
content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
|
||||
"""
|
||||
for i, description in dataset_df.iterrows():
|
||||
self.content_counter+=1
|
||||
|
@ -202,7 +192,7 @@ class VRE:
|
|||
self.content_counter+=1
|
||||
other_content_df.loc[str(self.content_counter)] = [self.content_counter, -3, description['title'] + ' is a paper. ' + description['notes'] + ' It is about ' + ', '.join(description['tags']) ]
|
||||
"""
|
||||
self.db['content_db'] = pd.concat([content_df, other_content_df]).reset_index(drop=True)
|
||||
self.db['content_db'] = content_df
|
||||
self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
|
||||
self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
|
||||
|
||||
|
@ -282,17 +272,17 @@ class VRE:
|
|||
self.dataset_counter += 1
|
||||
dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date, url]
|
||||
|
||||
self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)]).reset_index(drop=True)
|
||||
self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)]).reset_index(drop=True)
|
||||
self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])
|
||||
self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])
|
||||
|
||||
self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)]).reset_index(drop=True)
|
||||
|
||||
self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)])
|
||||
self.db['post_db'].to_json(self.directory+self.name+'_post.json')
|
||||
|
||||
other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
|
||||
for i, post in post_df.iterrows():
|
||||
if post['author'] != "Catalogue":
|
||||
if post['author'] != "catalogue":
|
||||
self.content_counter+=1
|
||||
other_content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
|
||||
content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
|
||||
"""
|
||||
for i, description in dataset_df.iterrows():
|
||||
self.content_counter+=1
|
||||
|
@ -304,7 +294,7 @@ class VRE:
|
|||
|
||||
self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
|
||||
self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
|
||||
self.db['content_db'] = pd.concat([self.db['content_db'], content_df, other_content_df]).reset_index(drop=True)
|
||||
self.db['content_db'] = pd.concat([self.db['content_db'], content_df])
|
||||
self.db['content_db'].to_json(self.directory + self.name + '_content.json')
|
||||
if not paper_df.empty or not dataset_df.empty or not content_df.empty or not post_df.empty:
|
||||
self.new_income = True
|
||||
|
|
11
main.py
11
main.py
|
@ -46,11 +46,16 @@ def vre_fetch():
|
|||
rg.update_index(vre.get_index())
|
||||
rg.update_db(vre.get_db())
|
||||
|
||||
|
||||
def user_interest_decay(token):
|
||||
while True:
|
||||
print("decaying interests after 3 minutes for " + users[token]['username'])
|
||||
time.sleep(180)
|
||||
users[token]['user'].decay_interests()
|
||||
if token in users:
|
||||
print("decaying interests after 3 minutes for " + users[token]['username'])
|
||||
time.sleep(180)
|
||||
users[token]['user'].decay_interests()
|
||||
else:
|
||||
break
|
||||
|
||||
|
||||
def clear_inactive():
|
||||
while True:
|
||||
|
|
Loading…
Reference in New Issue