From d851cae07b084a6ca7cb9eef6e56ebc4046f2611 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Wed, 5 Jul 2023 00:10:21 +0300 Subject: [PATCH] Add page for enrichment by mining --- .../enrichment-by-mining.md | 18 ++++++++++++++++++ versioned_sidebars/version-5.1.3-sidebars.json | 4 ++-- 2 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 versioned_docs/version-5.1.3/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md diff --git a/versioned_docs/version-5.1.3/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md b/versioned_docs/version-5.1.3/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md new file mode 100644 index 0000000..1561699 --- /dev/null +++ b/versioned_docs/version-5.1.3/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md @@ -0,0 +1,18 @@ +import DocCardList from '@theme/DocCardList'; + + +# Enrichment by mining + +**OpenAIRE** collects the full-texts of the publications, in order to apply TDM (Text and Data Mining) algorithms on them and enrich the Graph with inference links. + +The collection of the full-texts is handled by the internal **PDF Aggregation Service**. This service uses the publications' URLs from the OpenAIRE Graph, together with state-of-the-art algorithms, to crawl the web and try to locate and download the full-texts of the open access publications, while focusing on the most recent ones. It respects the servers of the repositories and publishers and avoids overloading them. + +The service orchestrates a distributed execution system on the cloud, with multiple microservices running in parallel, in order to efficiently process and download a large number of publications. The microservices store the generated report records for the publications in a database, and the full-texts in an S3 Object Store. 
+ +On the publication-page level, it applies text-mining algorithms to analyze the structure of the page, extract the full-text URL and download the file. Additionally, it tracks various performance indicators to optimize the crawling speed during execution. + +The PDF Aggregation Service is also capable of bulk-importing full-texts from compatible data sources, which increases the collection speed of full-texts. + +The different Text and Data Mining (TDM) algorithms used in the graph-enrichment process are grouped in the following categories. + +<DocCardList /> \ No newline at end of file diff --git a/versioned_sidebars/version-5.1.3-sidebars.json b/versioned_sidebars/version-5.1.3-sidebars.json index 0791e4d..75204f7 100644 --- a/versioned_sidebars/version-5.1.3-sidebars.json +++ b/versioned_sidebars/version-5.1.3-sidebars.json @@ -160,8 +160,8 @@ "type": "category", "label": "Enrichment by mining", "link": { - "type": "generated-index", - "description": "The OpenAIRE Graph is enriched using the different Text and Data Mining (TDM) algorithms that are grouped in the following categories." + "type": "doc", + "id": "graph-production-workflow/enrichment-by-mining/enrichment-by-mining" }, "items": [ {