# JanetBackEnd/VRE.py

from datetime import datetime
import pandas as pd
import string
import re
import requests
import os
from io import BytesIO
from tqdm.auto import tqdm
import numpy as np
import math
import faiss
import pdfquery
import urllib.request
import time
import threading
import html2text


class VRE:
    def __init__(self, name, token, retriever, directory='/app/'):
        self.name = name
        self.token = token
        self.catalogue_url = 'https://api.d4science.org/catalogue/items/'
        self.socialnetwork_url = 'https://api.d4science.org/rest/2/posts/get-posts-vre/'
        self.headers = headers = {"gcube-token": self.token, "Accept": "application/json"}
        self.lastupdatetime = datetime.strptime('2021-01-01T00:00:00.000000', '%Y-%m-%dT%H:%M:%S.%f').timestamp()
        self.postlastupdate = datetime.strptime('2021-01-01T00:00:00.000000', '%Y-%m-%dT%H:%M:%S.%f').timestamp()
        self.retriever = retriever
        self.directory = directory
        self.post_counter = 0
        self.paper_counter = 0
        self.dataset_counter = 0
        self.content_counter = 0
        self.db = {'paper_db': pd.read_json(self.directory + self.name + '_paper.json') if os.path.isfile(self.directory + self.name + '_paper.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']),
                   'dataset_db': pd.read_json(self.directory + self.name + '_dataset.json') if os.path.isfile(self.directory + self.name + '_dataset.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']), 
                   'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content']),
                   'post_db': pd.read_json(self.directory + self.name + '_post.json') if os.path.isfile(self.directory + self.name + '_post.json') else pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])}
        self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else faiss.read_index(self.directory + 'janet_dataset_titles_index'),
                  'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else faiss.read_index(self.directory + 'janet_paper_titles_index'),
                  'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else faiss.read_index(self.directory + 'janet_dataset_desc_index'),
                  'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else faiss.read_index(self.directory + 'janet_paper_desc_index'),
                  'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index'),
                  'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else faiss.read_index(self.directory + 'janet_post_index')}
        self.new_income = False

    def init(self):
        #first run 
        if not os.path.isfile(self.directory + self.name + '_dataset' + '.json') or not os.path.isfile(self.directory + self.name + '_paper' + '.json') or not os.path.isfile(self.directory + self.name + '_content' + '.json') or not os.path.isfile(self.directory + self.name + '_post' + '.json'): 
            self.get_content()
        if self.index['dataset_titles_index'] is None:
            self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
            self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
                
        if self.index['dataset_desc_index'] is None:
            self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
            self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
            
        if self.index['paper_titles_index'] is None:
            self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
            self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
            
        if self.index['paper_desc_index'] is None:
            self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
            self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
            
        if self.index['content_index'] is None:
            self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
            self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
          
        if self.index['post_index'] is None:
            self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
            self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')

    def index_periodic_update(self):
        if self.new_income:         
            if len(self.db['content_db'])%100 != 0:
                self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
                self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
            if len(self.db['post_db'])%100 != 0:
                self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
                self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
            if len(self.db['paper_db'])%100 != 0:
                self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
                self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
                self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
                self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
            if len(self.db['dataset_db'])%100 != 0:
                self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
                self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
                self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
                self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
            self.new_income = False

    def create_index(self, db_type, attribute, index_type, filename):
        filename = self.directory + filename
        to_index = self.db[db_type][attribute]
        for i, info in enumerate(to_index):
            if i == 0:
                emb = self.retriever.encode([info])
                sentence_embeddings = np.array(emb)
            else:
                emb = self.retriever.encode([info])
                sentence_embeddings = np.append(sentence_embeddings, emb, axis=0)

        # number of partitions of the coarse quantizer = number of posting lists
        # as rule of thumb, 4*sqrt(N) < nlist < 16*sqrt(N), where N is the size of the database
        nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1
        code_size = 8  # = number of subquantizers = number of sub-vectors
        n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings)))  # n_bits of each code (8 -> 1 byte codes)
        d = sentence_embeddings.shape[1]
        coarse_quantizer = faiss.IndexFlatL2(d)  # will keep centroids of coarse quantizer (for inverted list)
        self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)
        self.index[index_type].train(sentence_embeddings)  # train on a random subset to speed up k-means (NOTE: ensure they are randomly chosen!)
        faiss.write_index(self.index[index_type], filename)
    
    def populate_index(self, db_type, attribute, index_type, filename):
        filename = self.directory + filename
        to_index = self.db[db_type][attribute]
        for info in to_index:
            sentence_embedding = np.array(self.retriever.encode([info])) 
            self.index[index_type].add(sentence_embedding)
        faiss.write_index(self.index[index_type], filename)

    def get_content(self):
        """Download every post and catalogue item of the VRE and rebuild all databases.

        Posts come from ``self.socialnetwork_url``; the catalogue item list and
        each item's details come from ``self.catalogue_url``.  Items whose
        ``system:type`` extra is ``'Paper'`` or ``'Dataset'`` are stored; for
        papers the PDF and plain-text resources are also split into paragraphs
        (``get_pdf_content`` / ``get_txt_content``).  The four frames replace
        the entries of ``self.db`` and are written to
        ``<directory><name>_{paper,dataset,content,post}.json``.
        ``self.postlastupdate`` and ``self.lastupdatetime`` advance to the
        newest post / item seen.
        """
        def lowered(value):
            # NOTE(review): treats a null field as empty text; confirm the
            # catalogue can actually return null here.
            return (value or '').lower()

        h = html2text.HTML2Text()
        h.ignore_links = True

        # The databases are rebuilt from scratch, so numbering restarts too.
        self.post_counter = 0
        self.paper_counter = 0
        self.dataset_counter = 0
        self.content_counter = 0

        # posts
        posts = requests.get(self.socialnetwork_url, headers=self.headers)
        posts = posts.json()['result']
        post_df = pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])

        for post in posts:
            author = post['full_name'].lower()
            content = h.handle(post['description']).replace('\n', ' ').lower()
            date = post['time']
            # hashtags: every whitespace-separated word starting with '#'
            tags = [word[1:] for word in content.split() if word.startswith('#')]
            if date > self.postlastupdate:
                self.postlastupdate = date
            self.post_counter += 1
            post_df.loc[str(self.post_counter)] = [self.post_counter, author, content, date, tags]

        # catalog: the listing returns item identifiers, details are fetched one by one
        response = requests.get(self.catalogue_url, headers=self.headers)
        items = response.json()
        items_data = []
        for item in items:
            api_url = self.catalogue_url + item + '/'
            response = requests.get(api_url, headers=self.headers)
            items_data.append(response.json())

        catalogue_columns = ['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']
        paper_df = pd.DataFrame(columns=catalogue_columns)
        dataset_df = pd.DataFrame(columns=catalogue_columns)
        content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])

        for item in items_data:
            # Reset per item so an item without a 'system:type' extra is
            # skipped instead of inheriting the previous item's type.
            rsrc = None
            for el in item['extras']:
                if el['key'] == 'system:type':
                    rsrc = el['value']
            resources = [
                {'name': lowered(resource['name']), 'url': resource['url'],
                 'description': lowered(resource['description'])}
                for resource in item['resources']
            ]
            tags = [lowered(tag['name']) for tag in item['tags']]
            title = lowered(item['title'])
            author = lowered(item['author'])
            notes = lowered(item['notes'])
            date = datetime.strptime(item['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()
            if date > self.lastupdatetime:
                self.lastupdatetime = date
            if rsrc == 'Paper':
                self.paper_counter += 1
                paper_df.loc[str(self.paper_counter)] = [self.paper_counter, rsrc, resources, tags, title, author, notes, date]
                # paragraphs are linked to the paper through self.paper_counter
                content_df = self.get_pdf_content(item, content_df)
                content_df = self.get_txt_content(item, content_df)
            if rsrc == 'Dataset':
                self.dataset_counter += 1
                dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date]

        self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True)
        self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True)
        self.db['post_db'] = post_df.sort_values(by='time', ascending=True)
        self.db['content_db'] = content_df

        self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
        self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')

        self.db['content_db'].to_json(self.directory + self.name + '_content.json')
        self.db['post_db'].to_json(self.directory+self.name+'_post.json')

    # modify query
    def get_vre_update(self):
        """Fetch posts and catalogue items newer than the last update and append them.

        Posts with ``time`` greater than ``self.postlastupdate`` and catalogue
        items with ``metadata_created`` later than ``self.lastupdatetime`` are
        processed the same way as in ``get_content`` and concatenated to the
        frames in ``self.db``, which are then written back to their JSON
        files.  ``self.new_income`` is set when anything new was stored.
        """
        def lowered(value):
            # NOTE(review): treats a null field as empty text; confirm the
            # catalogue can actually return null here.
            return (value or '').lower()

        print("Getting new items")
        h = html2text.HTML2Text()
        h.ignore_links = True

        # posts
        posts = requests.get(self.socialnetwork_url, headers=self.headers)
        posts = posts.json()['result']
        new_posts = [post for post in posts if post['time'] > self.postlastupdate]

        post_df = pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])

        for post in new_posts:
            author = post['full_name'].lower()
            content = h.handle(post['description']).replace('\n', ' ').lower()
            date = post['time']
            # hashtags: every whitespace-separated word starting with '#'
            tags = [word[1:] for word in content.split() if word.startswith('#')]
            if date > self.postlastupdate:
                self.postlastupdate = date
            self.post_counter += 1
            post_df.loc[str(self.post_counter)] = [self.post_counter, author, content, date, tags]

        # catalog: keep only the items created after the last known one
        response = requests.get(self.catalogue_url, headers=self.headers)
        items = response.json()
        items_data = []
        for item in items:
            api_url = self.catalogue_url + item + '/'
            details = requests.get(api_url, headers=self.headers).json()  # parse the body once
            created = datetime.strptime(details['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()
            if created > self.lastupdatetime:
                items_data.append(details)

        catalogue_columns = ['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']
        paper_df = pd.DataFrame(columns=catalogue_columns)
        dataset_df = pd.DataFrame(columns=catalogue_columns)
        content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])

        for item in items_data:
            # Reset per item so an item without a 'system:type' extra is
            # skipped instead of inheriting the previous item's type.
            rsrc = None
            for el in item['extras']:
                if el['key'] == 'system:type':
                    rsrc = el['value']
            resources = [
                {'name': lowered(resource['name']), 'url': resource['url'],
                 'description': lowered(resource['description'])}
                for resource in item['resources']
            ]
            tags = [lowered(tag['name']) for tag in item['tags']]
            title = lowered(item['title'])
            author = lowered(item['author'])
            notes = lowered(item['notes'])
            date = datetime.strptime(item['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()
            if date > self.lastupdatetime:
                self.lastupdatetime = date

            if rsrc == 'Paper':
                self.paper_counter += 1
                paper_df.loc[str(self.paper_counter)] = [self.paper_counter, rsrc, resources, tags, title, author, notes, date]
                # paragraphs are linked to the paper through self.paper_counter
                content_df = self.get_pdf_content(item, content_df)
                content_df = self.get_txt_content(item, content_df)
            if rsrc == 'Dataset':
                self.dataset_counter += 1
                dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date]

        self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])
        self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])

        self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)])
        self.db['post_db'].to_json(self.directory+self.name+'_post.json')

        self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
        self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
        self.db['content_db'] = pd.concat([self.db['content_db'], content_df])
        self.db['content_db'].to_json(self.directory + self.name + '_content.json')
        if not paper_df.empty or not dataset_df.empty or not content_df.empty or not post_df.empty:
            self.new_income = True
    

    def remove_useless_dots(self, line):
        """Return ``line`` without the dots that do not end a sentence.

        A dot is dropped when:
          * the character two positions before it is a space or punctuation
            (single-letter initials, ``e.g.``-style abbreviations);
          * the character right before it is also a dot;
          * it directly follows ``" al"`` (as in ``et al.``).
        Any run of dots that is left is collapsed into a single dot.

        Positions before the start of the line never match a rule; the lookups
        no longer wrap around to the end of the string for dots at index 0-2.
        """
        kept = []
        for i, char in enumerate(line):
            if char != '.':
                kept.append(char)
                continue
            two_back = line[i - 2] if i >= 2 else ''
            one_back = line[i - 1] if i >= 1 else ''
            # `two_back and` matters: '' would count as "in string.punctuation"
            if two_back and (two_back == ' ' or two_back in string.punctuation):
                continue
            if one_back == '.':
                continue
            if i >= 3 and line[i - 3:i] == ' al':
                continue
            kept.append(char)
        return re.sub(r'\.+', ".", ''.join(kept))

    def check_if_sentence(self, sentence):
        """Return True when the text has more than nine words or contains a dot."""
        word_count = len(sentence.split())
        return word_count > 9 or '.' in sentence
    
    def get_abstract(self, text):
        """Return ``(start, end)`` line indexes delimiting the abstract.

        ``start`` is the first line whose first word is "abstract" (any case);
        ``end`` is the first empty line more than five lines after it.  When no
        such empty line exists ``end`` is ``len(text)``; when no abstract line
        exists the result is ``(0, len(text))``.
        """
        total = len(text)
        for start, line in enumerate(text):
            tokens = line.split()
            if not tokens or tokens[0].lower() != 'abstract':
                continue
            # only the first "abstract" line is considered
            for stop in range(start + 1, total):
                if len(text[stop]) == 0 and stop > start + 5:
                    return start, stop
            return start, total
        return 0, total
      
    def useful_index(self, text):
        """Return ``(start, end)`` line indexes of the body of a paper.

        ``start`` is the last line that opens with an introduction heading
        ("Introduction", "1 Introduction", "1. Introduction" or
        "1.Introduction", any case); 0 when none is found.  ``end`` is the
        first line whose first word is "Bibliography", "References",
        "Acknowledgement" or "Acknowledgements"; ``len(text)`` when none is
        found.

        Numbered headings written with a space ("1 Introduction",
        "1. Introduction") are matched on the first two words; comparing them
        with the first word alone could never succeed.
        """
        closing_words = ('bibliography', 'references', 'acknowledgement', 'acknowledgements')
        start = 0
        end = len(text)
        for i, line in enumerate(text):
            words = line.split()
            if not words:
                continue
            first = words[0].lower()
            first_two = ' '.join(words[:2]).lower()
            if first in closing_words and i < end:
                end = i
            if first in ('introduction', '1.introduction') or first_two in ('1 introduction', '1. introduction'):
                start = i
        return start, end
      
    def get_line_sentences(self, text, i):
        mytext = self.remove_useless_dots(text[i])
        if self.check_if_sentence(mytext):
            splits = mytext.split('.')
            for j in range(len(splits)):
                if j+1 < len(splits):
                    splits[j] = splits[j]+'. '
                if j == len(splits)-1:
                    splits[j] = splits[j].removesuffix('-')
            return splits, i+1
        else:
            return [], i+1

    def parts_to_sentences(self, parts):
        """Glue consecutive fragments together until each glued text contains a dot.

        Fragments left over after the last dot are discarded.
        """
        finished = []
        pending = ''
        for fragment in parts:
            candidate = pending + fragment
            if '.' not in candidate:
                pending = candidate
                continue
            finished.append(candidate)
            pending = ''
        return finished
      
    def get_pdf_content(self, item, df):
        """Append the paragraphs and the abstract of every PDF resource of ``item`` to ``df``.

        Args:
            item: catalogue item; each entry of ``item['resources']`` must
                have a ``'url'``.
            df: frame with the columns ``id``, ``paperid``, ``content``.

        Returns:
            ``df`` with one row per group of five sentences found between the
            indexes given by ``useful_index``, plus one row holding the lines
            selected by ``get_abstract``.  Rows are numbered with
            ``self.content_counter`` and linked to ``self.paper_counter``.

        The PDF is stored as ``<directory>janet.pdf`` for parsing.
        """
        pdf_path = self.directory + "janet.pdf"
        for rsrc in tqdm(item['resources']):
            response = requests.get(rsrc['url'])
            # a response without a Content-Type header is simply not a PDF
            if 'application/pdf' not in response.headers.get('content-type', ''):
                continue
            # reuse the bytes already downloaded instead of fetching the URL again
            with open(pdf_path, 'wb') as pdf_file:
                pdf_file.write(response.content)
            pdf = pdfquery.PDFQuery(pdf_path)
            pdf.load()

            # text of every horizontal line/box element; missing text counts as empty
            text = [el.text or '' for el in pdf.tree.iter()
                    if el.tag == 'LTTextLineHorizontal' or el.tag == 'LTTextBoxHorizontal']

            parts = []
            i, end = self.useful_index(text)
            while i < end:
                sent, i = self.get_line_sentences(text, i)
                for part in sent:
                    if part != '':
                        x = part
                        # drop a leading space, or a single character followed by a space
                        if len(part) > 1 and part[0] == ' ':
                            x = part[1:]
                        if len(part) > 2 and part[1] == ' ':
                            x = part[2:]
                        parts.append(x)
            sentences = self.parts_to_sentences(parts)

            # paragraphs are consecutive groups of five sentences; a shorter tail is dropped
            paragraphs = [''.join(sentences[k:k + 5]) for k in range(0, len(sentences) - 4, 5)]
            for paragraph in tqdm(paragraphs):
                self.content_counter += 1
                df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, paragraph]

            start, end = self.get_abstract(text)
            abstract = ''.join(text[start:end])
            self.content_counter += 1
            df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, abstract]
        return df

    def get_txt_content(self, item, df):
        """Append the paragraphs of every plain-text resource of ``item`` to ``df``.

        Args:
            item: catalogue item; each entry of ``item['resources']`` must
                have a ``'url'``.
            df: frame with the columns ``id``, ``paperid``, ``content``.

        Returns:
            ``df`` with one row per group of five dot-separated sentences; a
            shorter tail is dropped.  Rows are numbered with
            ``self.content_counter`` and linked to ``self.paper_counter``.
        """
        for rsrc in tqdm(item['resources']):
            response = requests.get(rsrc['url'])
            # a response without a Content-Type header is simply not plain text
            if 'text/plain' not in response.headers.get('content-type', ''):
                continue
            content = self.remove_useless_dots(response.text)
            sentences = content.split('.')
            # every sentence of a paragraph, including the last, is followed by '. '
            paragraphs = ['. '.join(sentences[k:k + 5]) + '. '
                          for k in range(0, len(sentences) - 4, 5)]
            for paragraph in tqdm(paragraphs):
                self.content_counter += 1
                df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, paragraph]
        return df
    def get_db(self):
        """Return the dict of databases (keys: paper_db, dataset_db, content_db, post_db)."""
        return self.db
    def get_index(self):
        """Return the dict of indexes; an entry is None until that index is loaded or created."""
        return self.index
backendrepo 2023-03-30 15:17:54 +02:00			`from datetime import datetime`
			`import pandas as pd`
enhance_janet 2023-04-15 10:52:01 +02:00			`import string`
			`import re`
backendrepo 2023-03-30 15:17:54 +02:00			`import requests`
			`import os`
			`from io import BytesIO`
			`from tqdm.auto import tqdm`
			`import numpy as np`
			`import math`
			`import faiss`
enhance_janet 2023-04-15 10:52:01 +02:00			`import pdfquery`
			`import urllib.request`
backendrepo 2023-03-30 15:17:54 +02:00			`import time`
			`import threading`
enhance_janet 2023-04-15 10:52:01 +02:00			`import html2text`

backendrepo 2023-03-30 15:17:54 +02:00
			`class VRE:`
new_workdir 2023-04-08 22:51:44 +02:00			`def __init__(self, name, token, retriever, directory='/app/'):`
backendrepo 2023-03-30 15:17:54 +02:00			`self.name = name`
			`self.token = token`
			`self.catalogue_url = 'https://api.d4science.org/catalogue/items/'`
enhance_janet 2023-04-15 10:52:01 +02:00			`self.socialnetwork_url = 'https://api.d4science.org/rest/2/posts/get-posts-vre/'`
backendrepo 2023-03-30 15:17:54 +02:00			`self.headers = headers = {"gcube-token": self.token, "Accept": "application/json"}`
			`self.lastupdatetime = datetime.strptime('2021-01-01T00:00:00.000000', '%Y-%m-%dT%H:%M:%S.%f').timestamp()`
enhance_janet 2023-04-15 10:52:01 +02:00			`self.postlastupdate = datetime.strptime('2021-01-01T00:00:00.000000', '%Y-%m-%dT%H:%M:%S.%f').timestamp()`
backendrepo 2023-03-30 15:17:54 +02:00			`self.retriever = retriever`
			`self.directory = directory`
enhance_janet 2023-04-15 10:52:01 +02:00			`self.post_counter = 0`
backendrepo 2023-03-30 15:17:54 +02:00			`self.paper_counter = 0`
			`self.dataset_counter = 0`
			`self.content_counter = 0`
			`self.db = {'paper_db': pd.read_json(self.directory + self.name + '_paper.json') if os.path.isfile(self.directory + self.name + '_paper.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']),`
			`'dataset_db': pd.read_json(self.directory + self.name + '_dataset.json') if os.path.isfile(self.directory + self.name + '_dataset.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']),`
enhance_janet 2023-04-15 10:52:01 +02:00			`'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content']),`
			`'post_db': pd.read_json(self.directory + self.name + '_post.json') if os.path.isfile(self.directory + self.name + '_post.json') else pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])}`
new_workdir 2023-04-08 22:51:44 +02:00			`self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else faiss.read_index(self.directory + 'janet_dataset_titles_index'),`
			`'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else faiss.read_index(self.directory + 'janet_paper_titles_index'),`
			`'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else faiss.read_index(self.directory + 'janet_dataset_desc_index'),`
			`'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else faiss.read_index(self.directory + 'janet_paper_desc_index'),`
enhance_janet 2023-04-15 10:52:01 +02:00			`'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index'),`
			`'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else faiss.read_index(self.directory + 'janet_post_index')}`
backendrepo 2023-03-30 15:17:54 +02:00			`self.new_income = False`

			`def init(self):`
enhance_janet 2023-04-15 10:52:01 +02:00			`#first run`
			`if not os.path.isfile(self.directory + self.name + '_dataset' + '.json') or not os.path.isfile(self.directory + self.name + '_paper' + '.json') or not os.path.isfile(self.directory + self.name + '_content' + '.json') or not os.path.isfile(self.directory + self.name + '_post' + '.json'):`
			`self.get_content()`
			`if self.index['dataset_titles_index'] is None:`
			`self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')`
			`self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')`

			`if self.index['dataset_desc_index'] is None:`
			`self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')`
			`self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')`

			`if self.index['paper_titles_index'] is None:`
			`self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')`
			`self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')`

			`if self.index['paper_desc_index'] is None:`
			`self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')`
			`self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')`

			`if self.index['content_index'] is None:`
			`self.create_index('content_db', 'content', 'content_index', 'janet_content_index')`
			`self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')`

			`if self.index['post_index'] is None:`
			`self.create_index('post_db', 'content', 'post_index', 'janet_post_index')`
			`self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')`
backendrepo 2023-03-30 15:17:54 +02:00
			`def index_periodic_update(self):`
enhance_janet 2023-04-15 10:52:01 +02:00			`if self.new_income:`
			`if len(self.db['content_db'])%100 != 0:`
			`self.create_index('content_db', 'content', 'content_index', 'janet_content_index')`
			`self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')`
			`if len(self.db['post_db'])%100 != 0:`
			`self.create_index('post_db', 'content', 'post_index', 'janet_post_index')`
			`self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')`
			`if len(self.db['paper_db'])%100 != 0:`
			`self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')`
			`self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')`
			`self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')`
			`self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')`
			`if len(self.db['dataset_db'])%100 != 0:`
			`self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')`
			`self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')`
			`self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')`
			`self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')`
			`self.new_income = False`
backendrepo 2023-03-30 15:17:54 +02:00
			`def create_index(self, db_type, attribute, index_type, filename):`
enhance_janet 2023-04-15 10:52:01 +02:00			`filename = self.directory + filename`
			`to_index = self.db[db_type][attribute]`
			`for i, info in enumerate(to_index):`
			`if i == 0:`
			`emb = self.retriever.encode([info])`
			`sentence_embeddings = np.array(emb)`
			`else:`
			`emb = self.retriever.encode([info])`
			`sentence_embeddings = np.append(sentence_embeddings, emb, axis=0)`

			`# number of partitions of the coarse quantizer = number of posting lists`
			`# as rule of thumb, 4sqrt(N) < nlist < 16sqrt(N), where N is the size of the database`
			`nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1`
			`code_size = 8 # = number of subquantizers = number of sub-vectors`
			`n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings))) # n_bits of each code (8 -> 1 byte codes)`
			`d = sentence_embeddings.shape[1]`
			`coarse_quantizer = faiss.IndexFlatL2(d) # will keep centroids of coarse quantizer (for inverted list)`
			`self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)`
			`self.index[index_type].train(sentence_embeddings) # train on a random subset to speed up k-means (NOTE: ensure they are randomly chosen!)`
			`faiss.write_index(self.index[index_type], filename)`
backendrepo 2023-03-30 15:17:54 +02:00
			`def populate_index(self, db_type, attribute, index_type, filename):`
enhance_janet 2023-04-15 10:52:01 +02:00			`filename = self.directory + filename`
			`to_index = self.db[db_type][attribute]`
			`for info in to_index:`
			`sentence_embedding = np.array(self.retriever.encode([info]))`
			`self.index[index_type].add(sentence_embedding)`
			`faiss.write_index(self.index[index_type], filename)`
backendrepo 2023-03-30 15:17:54 +02:00
			`def get_content(self):`
enhance_janet 2023-04-15 10:52:01 +02:00			`h = html2text.HTML2Text()`
			`h.ignore_links = True`
			`#posts`
syntax_fix 2023-04-15 16:29:38 +02:00			`posts = requests.get(self.socialnetwork_url, headers=self.headers)`
enhance_janet 2023-04-15 10:52:01 +02:00			`posts = posts.json()['result']`
			`post_df = pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])`

			`for post in posts:`
			`author = post['full_name'].lower()`
			`content = h.handle(post['description']).replace('\n', ' ').lower()`
			`date = post['time']`
			`tags = []`
			`for word in content.split():`
			`if word[0] == '#':`
			`tags.append(word[1:])`
			`if date > self.postlastupdate:`
			`self.postlastupdate = date`
			`self.post_counter += 1`
			`post_df.loc[str(self.post_counter)] = [self.post_counter, author, content, date, tags]`

			`#catalog`
backendrepo 2023-03-30 15:17:54 +02:00			`response = requests.get(self.catalogue_url, headers=self.headers)`
			`items = response.json()`
			`items_data = []`
			`for item in items:`
			`api_url = self.catalogue_url + item + '/'`
			`response = requests.get(api_url, headers=self.headers)`
			`items_data.append(response.json())`

			`keys = ['type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']`

			`paper_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])`
			`dataset_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])`
			`content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])`

			`for item in items_data:`
			`for el in item['extras']:`
			`if el['key'] == 'system:type':`
			`rsrc = el['value']`
			`resources = []`
			`for resource in item['resources']:`
			`resources.append(`
			`{'name': resource['name'].lower(), 'url': resource['url'], 'description': resource['description'].lower()})`
			`tags = []`
			`for tag in item['tags']:`
			`tags.append(tag['name'].lower())`
			`title = item['title'].lower()`
			`author = item['author'].lower()`
			`notes = item['notes'].lower()`
			`date = datetime.strptime(item['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()`
			`if date > self.lastupdatetime:`
			`self.lastupdatetime = date`
			`if rsrc == 'Paper':`
			`self.paper_counter += 1`
			`paper_df.loc[str(self.paper_counter)] = [self.paper_counter, rsrc, resources, tags, title, author, notes, date]`
			`content_df = self.get_pdf_content(item, content_df)`
			`content_df = self.get_txt_content(item, content_df)`
			`if rsrc == 'Dataset':`
			`self.dataset_counter += 1`
			`dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date]`

			`self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True)`
			`self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True)`
enhance_janet 2023-04-15 10:52:01 +02:00			`self.db['post_db'] = post_df.sort_values(by='time', ascending=True)`
backendrepo 2023-03-30 15:17:54 +02:00			`self.db['content_db'] = content_df`

new_workdir 2023-04-08 22:51:44 +02:00			`self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')`
			`self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')`
backendrepo 2023-03-30 15:17:54 +02:00
new_workdir 2023-04-08 22:51:44 +02:00			`self.db['content_db'].to_json(self.directory + self.name + '_content.json')`
enhance_janet 2023-04-15 10:52:01 +02:00			`self.db['post_db'].to_json(self.directory+self.name+'_post.json')`
backendrepo 2023-03-30 15:17:54 +02:00
			`# modify query`
			`def get_vre_update(self):`
enhance_janet 2023-04-15 10:52:01 +02:00			`print("Getting new items")`
			`h = html2text.HTML2Text()`
			`h.ignore_links = True`
			`#posts`
syntax_fix 2023-04-15 16:29:38 +02:00			`posts = requests.get(self.socialnetwork_url, headers=self.headers)`
enhance_janet 2023-04-15 10:52:01 +02:00			`posts = posts.json()['result']`
			`new_posts = []`
			`for post in posts:`
			`if post['time'] > self.postlastupdate:`
			`new_posts.append(post)`

			`post_df = pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])`

			`for post in new_posts:`
			`author = post['full_name'].lower()`
			`content = h.handle(post['description']).replace('\n', ' ').lower()`
			`date = post['time']`
			`tags = []`
			`for word in content.split():`
			`if word[0] == '#':`
			`tags.append(word[1:])`
			`if date > self.postlastupdate:`
			`self.postlastupdate = date`
			`self.post_counter += 1`
			`post_df.loc[str(self.post_counter)] = [self.post_counter, author, content, date, tags]`

			`#catalog`
			`response = requests.get(self.catalogue_url, headers=self.headers)`
			`items = response.json()`
			`items_data = []`
			`for item in items:`
			`api_url = self.catalogue_url + item + '/'`
			`response = requests.get(api_url, headers=self.headers)`
			`if datetime.strptime(response.json()['metadata_created'],'%Y-%m-%dT%H:%M:%S.%f').timestamp() > self.lastupdatetime:`
			`items_data.append(response.json())`

			`keys = ['type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']`

			`paper_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])`
			`dataset_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])`
			`content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])`

			`for item in items_data:`
			`for el in item['extras']:`
			`if el['key'] == 'system:type':`
			`rsrc = el['value']`
			`resources = []`
			`for resource in item['resources']:`
			`resources.append(`
			`{'name': resource['name'].lower(), 'url': resource['url'], 'description': resource['description'].lower()})`
			`tags = []`
			`for tag in item['tags']:`
			`tags.append(tag['name'].lower())`
			`title = item['title'].lower()`
			`author = item['author'].lower()`
			`notes = item['notes'].lower()`
			`date = datetime.strptime(item['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()`
			`if date > self.lastupdatetime:`
			`self.lastupdatetime = date`

			`if rsrc == 'Paper':`
			`self.paper_counter += 1`
			`paper_df.loc[str(self.paper_counter)] = [self.paper_counter, rsrc, resources, tags, title, author, notes, date]`
			`content_df = self.get_pdf_content(item, content_df)`
			`content_df = self.get_txt_content(item, content_df)`
			`if rsrc == 'Dataset':`
			`self.dataset_counter += 1`
			`dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date]`

			`self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])`
			`self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])`
backendrepo 2023-03-30 15:17:54 +02:00
enhance_janet 2023-04-15 10:52:01 +02:00			`self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)])`
			`self.db['post_db'].to_json(self.directory+self.name+'_post.json')`

			`self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')`
			`self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')`
			`self.db['content_db'] = pd.concat([self.db['content_db'], content_df])`
			`self.db['content_db'].to_json(self.directory + self.name + '_content.json')`
			`if not paper_df.empty or not dataset_df.empty or not content_df.empty or not post_df.empty:`
			`self.new_income = True`
backendrepo 2023-03-30 15:17:54 +02:00

enhance_janet 2023-04-15 10:52:01 +02:00			`def remove_useless_dots(self, line):`
			`modline = ''`
			`for i in range(0, len(line)):`
			`if line[i] != '.':`
			`modline+=line[i]`
			`if line[i] == '.':`
			`if line[i-2] == ' ' or line[i-2] in string.punctuation:`
			`continue`
			`if line[i-1] == '.':`
			`continue`
			`if line[i-3] == ' ' and line[i-2] == 'a' and line[i-1] == 'l':`
			`continue`
			`modline+=line[i]`
			`modline = re.sub(r'\.+', ".", modline)`
			`return modline`

			`def check_if_sentence(self, sentence):`
			`if (len(sentence.split())) > 9 or '.' in sentence:`
			`return True`
			`return False`

			`def get_abstract(self, text):`
			`abstract_start = 0`
			`abstract_end = len(text)`
			`for i in range(0, len(text)):`
			`if len(text[i].split()) > 0:`
			`words = text[i].split()`
			`if words[0].lower() == 'abstract':`
			`abstract_start = i`
			`for j in range(i+1, len(text)):`
			`if len(text[j]) == 0 and j > i+5:`
			`abstract_end = j`
			`break`
			`break`
			`return abstract_start, abstract_end`

			`def useful_index(self, text):`
			`start = 0`
			`end = len(text)`
			`for i in range(0, len(text)):`
			`if len(text[i].split()) > 0:`
			`words = text[i].split()`
			`if words[0].lower() in ['bibliography','references']:`
			`if i < end:`
			`end = i`
			`if words[0].lower() in ['introduction', '1 introduction', '1. introduction', '1.introduction']:`
			`start = i`
			`if words[0].lower() in ['acknowledgement', 'acknowledgements']:`
			`if i < end:`
			`end = i`
			`return start, end`

			`def get_line_sentences(self, text, i):`
			`mytext = self.remove_useless_dots(text[i])`
			`if self.check_if_sentence(mytext):`
			`splits = mytext.split('.')`
			`for j in range(len(splits)):`
			`if j+1 < len(splits):`
			`splits[j] = splits[j]+'. '`
			`if j == len(splits)-1:`
			`splits[j] = splits[j].removesuffix('-')`
			`return splits, i+1`
			`else:`
			`return [], i+1`

			`def parts_to_sentences(self, parts):`
			`sentences = []`
			`sentence = ''`
			`for part in parts:`
			`sentence += part`
			`if '.' in sentence:`
			`sentences.append(sentence)`
			`sentence = ''`
			`return sentences`

backendrepo 2023-03-30 15:17:54 +02:00			`def get_pdf_content(self, item, df):`
enhance_janet 2023-04-15 10:52:01 +02:00			`for rsrc in tqdm(item['resources']):`
			`response = requests.get(rsrc['url'])`
			`if 'application/pdf' in response.headers.get('content-type'):`
			`urllib.request.urlretrieve(rsrc['url'], self.directory + "janet.pdf")`
			`pdf = pdfquery.PDFQuery(self.directory + "janet.pdf")`
			`pdf.load()`
			`#pages = pdf.pq('LTPage')`
			`text = []`

			`for i, el in enumerate(pdf.tree.getiterator()):`
			`if el.tag == 'LTTextLineHorizontal' or el.tag == 'LTTextBoxHorizontal':`
			`text.append(el.text)`

			`paragraphs = []`
			`parts = []`
			`i, end = self.useful_index(text)`
			`while i < end:`
			`sent, i = self.get_line_sentences(text, i)`
			`for part in sent:`
			`if part!='':`
			`x = part`
			`if len(part) > 1 and part[0] == ' ':`
			`x = part[1:]`
			`if len(part) > 2 and part[1] == ' ':`
			`x = part[2:]`
			`parts.append(x)`
			`sentences = self.parts_to_sentences(parts)`
			`for i in range(0, len(sentences)-4, 5):`
			`paragraph = sentences[i] + sentences[i+1] + sentences[i+2] + sentences[i+3] + sentences[i+4]`
			`paragraphs.append(paragraph)`
			`for paragraph in tqdm(paragraphs):`
			`self.content_counter += 1`
			`df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, paragraph]`

			`start, end = self.get_abstract(text)`
			`abstract = ''`
			`for i in range(start, end):`
			`abstract += text[i]`
			`self.content_counter += 1`
			`df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, abstract]`
			`return df`
backendrepo 2023-03-30 15:17:54 +02:00
			`def get_txt_content(self, item, df):`
enhance_janet 2023-04-15 10:52:01 +02:00			`for rsrc in tqdm(item['resources']):`
			`response = requests.get(rsrc['url'])`
			`if 'text/plain' in response.headers.get('content-type'):`
			`content = response.text`
			`content = self.remove_useless_dots(content)`
			`sentences = content.split('.')`
			`paragraphs = []`
			`for i in range(0, len(sentences)-4, 5):`
			`paragraph = sentences[i] + '. ' + sentences[i+1]+ '. ' + sentences[i+2]+ '. ' + sentences[i+3] + '. ' + sentences[i+4]+ '. '`
			`paragraphs.append(paragraph)`
			`for paragraph in tqdm(paragraphs):`
			`self.content_counter += 1`
			`df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, paragraph]`
			`return df`
backendrepo 2023-03-30 15:17:54 +02:00			`def get_db(self):`
enhance_janet 2023-04-15 10:52:01 +02:00			`return self.db`
backendrepo 2023-03-30 15:17:54 +02:00			`def get_index(self):`
enhance_janet 2023-04-15 10:52:01 +02:00			`return self.index`