JanetBackEnd/VRE.py

from datetime import datetime
import pandas as pd
import requests
import os
from io import BytesIO
import PyPDF2
from tqdm.auto import tqdm
import numpy as np
import math
import faiss
import time
import threading

class VRE:
    def __init__(self, name, token, retriever, directory='./'):
        self.name = name
        self.token = token
        self.catalogue_url = 'https://api.d4science.org/catalogue/items/'
        self.headers = headers = {"gcube-token": self.token, "Accept": "application/json"}
        self.lastupdatetime = datetime.strptime('2021-01-01T00:00:00.000000', '%Y-%m-%dT%H:%M:%S.%f').timestamp()
        self.retriever = retriever
        self.directory = directory
        self.paper_counter = 0
        self.dataset_counter = 0
        self.content_counter = 0
        self.db = {'paper_db': pd.read_json(self.directory + self.name + '_paper.json') if os.path.isfile(self.directory + self.name + '_paper.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']),
                   'dataset_db': pd.read_json(self.directory + self.name + '_dataset.json') if os.path.isfile(self.directory + self.name + '_dataset.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']),
                   'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content'])}
        self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else faiss.read_index('janet_dataset_titles_index'),
                  'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else faiss.read_index('janet_paper_titles_index'),
                  'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else faiss.read_index('janet_dataset_desc_index'),
                  'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else faiss.read_index('janet_paper_desc_index'),
                  'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index('janet_content_index')}
        self.new_income = False

    def init(self):
      #first run
      if not os.path.isfile(self.directory + self.name + '_dataset' + '.json') or not os.path.isfile(self.directory + self.name + '_paper' + '.json') or not os.path.isfile(self.directory + self.name + '_content' + '.json'):
        self.get_content()
      if self.index['dataset_titles_index'] is None:
        self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
        self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')

      if self.index['dataset_desc_index'] is None:
        self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
        self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')

      if self.index['paper_titles_index'] is None:
        self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
        self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')

      if self.index['paper_desc_index'] is None:
        self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
        self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')

      if self.index['content_index'] is None:
        self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
        self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')


    def index_periodic_update(self):
      if self.new_income:
        if len(self.db['content_db'])%100 != 0:
          self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
          self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
        if len(self.db['paper_db'])%100 != 0:
          self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
          self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
          self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
          self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
        if len(self.db['dataset_db'])%100 != 0:
          self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
          self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
          self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
          self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
        self.new_income = False

    def create_index(self, db_type, attribute, index_type, filename):
      to_index = self.db[db_type][attribute]
      for i, info in enumerate(to_index):
        if i == 0:
          emb = self.retriever.encode([info])
          sentence_embeddings = np.array(emb)
        else:
          emb = self.retriever.encode([info])
          sentence_embeddings = np.append(sentence_embeddings, emb, axis=0)

      # number of partitions of the coarse quantizer = number of posting lists
      # as rule of thumb, 4*sqrt(N) < nlist < 16*sqrt(N), where N is the size of the database
      nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1
      code_size = 8  # = number of subquantizers = number of sub-vectors
      n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings)))  # n_bits of each code (8 -> 1 byte codes)
      d = sentence_embeddings.shape[1]
      coarse_quantizer = faiss.IndexFlatL2(d)  # will keep centroids of coarse quantizer (for inverted list)
      self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)
      self.index[index_type].train(sentence_embeddings)  # train on a random subset to speed up k-means (NOTE: ensure they are randomly chosen!)
      faiss.write_index(self.index[index_type], filename)

    def populate_index(self, db_type, attribute, index_type, filename):
      to_index = self.db[db_type][attribute]
      for info in to_index:
        sentence_embedding = np.array(self.retriever.encode([info]))
        self.index[index_type].add(sentence_embedding)
      faiss.write_index(self.index[index_type], filename)

    def get_content(self):
        response = requests.get(self.catalogue_url, headers=self.headers)
        items = response.json()
        items_data = []
        for item in items:
            api_url = self.catalogue_url + item + '/'
            response = requests.get(api_url, headers=self.headers)
            items_data.append(response.json())

        keys = ['type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']

        paper_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
        dataset_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
        content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])

        for item in items_data:
            for el in item['extras']:
                if el['key'] == 'system:type':
                    rsrc = el['value']
            resources = []
            for resource in item['resources']:
                resources.append(
                    {'name': resource['name'].lower(), 'url': resource['url'], 'description': resource['description'].lower()})
            tags = []
            for tag in item['tags']:
                tags.append(tag['name'].lower())
            title = item['title'].lower()
            author = item['author'].lower()
            notes = item['notes'].lower()
            date = datetime.strptime(item['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()
            if date > self.lastupdatetime:
                self.lastupdatetime = date
            if rsrc == 'Paper':
              self.paper_counter += 1
              paper_df.loc[str(self.paper_counter)] = [self.paper_counter, rsrc, resources, tags, title, author, notes, date]
              content_df = self.get_pdf_content(item, content_df)
              content_df = self.get_txt_content(item, content_df)
            if rsrc == 'Dataset':
               self.dataset_counter += 1
               dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date]

        self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True)
        self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True)
        self.db['content_db'] = content_df

        self.db['paper_db'].to_json(self.name + '_paper.json')
        self.db['dataset_db'].to_json(self.name + '_dataset.json')

        self.db['content_db'].to_json(self.name + '_content.json')

    # modify query
    def get_vre_update(self):
      print("Getting new items")
      response = requests.get(self.catalogue_url, headers=self.headers)
      items = response.json()
      items_data = []
      for item in items:
          api_url = self.catalogue_url + item + '/'
          response = requests.get(api_url, headers=self.headers)
          if datetime.strptime(response.json()['metadata_created'],'%Y-%m-%dT%H:%M:%S.%f').timestamp() > self.lastupdatetime:
              items_data.append(response.json())

      keys = ['type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']

      paper_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
      dataset_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
      content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])

      for item in items_data:
          for el in item['extras']:
              if el['key'] == 'system:type':
                  rsrc = el['value']
          resources = []
          for resource in item['resources']:
              resources.append(
                  {'name': resource['name'].lower(), 'url': resource['url'], 'description': resource['description'].lower()})
          tags = []
          for tag in item['tags']:
              tags.append(tag['name'].lower())
          title = item['title'].lower()
          author = item['author'].lower()
          notes = item['notes'].lower()
          date = datetime.strptime(item['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()
          if date > self.lastupdatetime:
              self.lastupdatetime = date

          if rsrc == 'Paper':
            self.paper_counter += 1
            paper_df.loc[str(self.paper_counter)] = [self.paper_counter, rsrc, resources, tags, title, author, notes, date]
            content_df = self.get_pdf_content(item, content_df)
            content_df = self.get_txt_content(item, content_df)
          if rsrc == 'Dataset':
            self.dataset_counter += 1
            dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date]

      self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])
      self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])

      self.db['paper_db'].to_json(self.name + '_paper.json')
      self.db['dataset_db'].to_json(self.name + '_dataset.json')
      self.db['content_db'] = pd.concat([self.db['content_db'], content_df])
      self.db['content_db'].to_json(self.name + '_content.json')
      if not paper_df.empty or not dataset_df.empty or not content_df.empty:
        self.new_income = True

    def get_pdf_content(self, item, df):
      for rsrc in tqdm(item['resources']):
        response = requests.get(rsrc['url'])
        if 'application/pdf' in response.headers.get('content-type'):
          my_raw_data = response.content
          with BytesIO(my_raw_data) as data:
            read_pdf = PyPDF2.PdfReader(data)
            for page in tqdm(range(len(read_pdf.pages))):
              content = read_pdf.pages[page].extract_text()
              self.content_counter += 1
              df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, content]
      return df

    def get_txt_content(self, item, df):
      for rsrc in tqdm(item['resources']):
        response = requests.get(rsrc['url'])
        if 'text/plain' in response.headers.get('content-type'):
          content = response.text
          self.content_counter += 1
          df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, content]
      return df
    def get_db(self):
      return self.db
    def get_index(self):
      return self.index