From d851cae07b084a6ca7cb9eef6e56ebc4046f2611 Mon Sep 17 00:00:00 2001
From: Serafeim Chatzopoulos <s.chatzopoulos@gmail.com>
Date: Wed, 5 Jul 2023 00:10:21 +0300
Subject: [PATCH] Add page for enrichment by mining

---
 .../enrichment-by-mining.md                    | 18 ++++++++++++++++++
 versioned_sidebars/version-5.1.3-sidebars.json |  4 ++--
 2 files changed, 20 insertions(+), 2 deletions(-)
 create mode 100644 versioned_docs/version-5.1.3/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md
diff --git a/versioned_docs/version-5.1.3/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md b/versioned_docs/version-5.1.3/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md
new file mode 100644
index 0000000..1561699
--- /dev/null
+++ b/versioned_docs/version-5.1.3/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md
@@ -0,0 +1,18 @@
+import DocCardList from '@theme/DocCardList';
+
+
+# Enrichment by mining
+
+**OpenAIRE** collects the full-texts of the publications in order to apply Text and Data Mining (TDM) algorithms to them and enrich the Graph with inference links.
+
+The collection of the full-texts is handled by the internal **PDF Aggregation Service**. This service uses the publications' URLs from the OpenAIRE Graph, together with state-of-the-art algorithms, to crawl the web and to locate and download the full-texts of the open access publications, while focusing on the most recent ones. It respects the servers of the repositories and publishers and avoids overloading them.
+
+The service orchestrates a distributed execution system in the cloud, with multiple microservices running in parallel, in order to efficiently process and download a large number of publications. The microservices store the generated report records for the publications in a database, and the full-texts in an S3 Object Store.
+
+On the publication-page level, it applies text-mining algorithms to analyze the structure of the page, extract the full-text URL and download the file. Additionally, it tracks various performance indicators to optimize the crawling speed during execution.
+
+The PDF Aggregation Service is also capable of bulk-importing full-texts from compatible data sources, which increases the collection speed of full-texts.
+
+The different Text and Data Mining (TDM) algorithms used in the graph-enrichment process are grouped into the following categories:
+
+<DocCardList></DocCardList>
\ No newline at end of file
diff --git a/versioned_sidebars/version-5.1.3-sidebars.json b/versioned_sidebars/version-5.1.3-sidebars.json
index 0791e4d..75204f7 100644
--- a/versioned_sidebars/version-5.1.3-sidebars.json
+++ b/versioned_sidebars/version-5.1.3-sidebars.json
@@ -160,8 +160,8 @@
           "type": "category",
           "label": "Enrichment by mining",
           "link": {
-            "type": "generated-index",
-            "description": "The OpenAIRE Graph is enriched using the different Text and Data Mining (TDM) algorithms that are grouped in the following categories."
+            "type": "doc",
+            "id": "graph-production-workflow/enrichment-by-mining/enrichment-by-mining"
           },
           "items": [
             {