From 12a0827944f435eec885f851604eeef4d5018f84 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Wed, 22 Mar 2023 14:56:06 +0200 Subject: [PATCH 1/4] Add a dedicated page for enrichment by mining --- sidebars.js | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sidebars.js b/sidebars.js index 3f99733..3feca6c 100644 --- a/sidebars.js +++ b/sidebars.js @@ -100,10 +100,7 @@ const sidebars = { { type: 'category', label: "Enrichment by mining", - link: { - type: 'generated-index', - description: 'The OpenAIRE Graph is enriched using the different Text and Data Mining (TDM) algorithms that are grouped in the following categories.' - }, + link: {type: 'doc', id: 'graph-production-workflow/enrichment-by-mining/enrichment-by-mining'}, items: [ { type: 'doc', id: 'graph-production-workflow/enrichment-by-mining/affiliation_matching' }, { type: 'doc', id: 'graph-production-workflow/enrichment-by-mining/citation_matching' }, From f530e6b7387f4607ed8b1dc7bfabb9ce2feefab5 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Wed, 22 Mar 2023 14:56:06 +0200 Subject: [PATCH 2/4] Add a dedicated page for enrichment by mining --- .../enrichment-by-mining/enrichment-by-mining.md | 8 ++++++++ sidebars.js | 5 +---- 2 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md diff --git a/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md b/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md new file mode 100644 index 0000000..245495e --- /dev/null +++ b/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md @@ -0,0 +1,8 @@ +import DocCardList from '@theme/DocCardList'; + + +# Enrichment by mining + +The OpenAIRE Research Graph is enriched using the different Text and Data Mining (TDM) algorithms that are grouped in the following categories. 
+ +<DocCardList /> \ No newline at end of file diff --git a/sidebars.js b/sidebars.js index 3f99733..3feca6c 100644 --- a/sidebars.js +++ b/sidebars.js @@ -100,10 +100,7 @@ const sidebars = { { type: 'category', label: "Enrichment by mining", - link: { - type: 'generated-index', - description: 'The OpenAIRE Graph is enriched using the different Text and Data Mining (TDM) algorithms that are grouped in the following categories.' - }, + link: {type: 'doc', id: 'graph-production-workflow/enrichment-by-mining/enrichment-by-mining'}, items: [ { type: 'doc', id: 'graph-production-workflow/enrichment-by-mining/affiliation_matching' }, { type: 'doc', id: 'graph-production-workflow/enrichment-by-mining/citation_matching' }, From a4f15a3f837b47a552308d154ac12ee3c9eb86a0 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Mon, 3 Jul 2023 17:57:30 +0300 Subject: [PATCH 3/4] Add the documentation about the PDF Aggregation Service. --- .../enrichment-by-mining/enrichment-by-mining.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md b/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md index 245495e..b30718d 100644 --- a/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md +++ b/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md @@ -3,6 +3,16 @@ import DocCardList from '@theme/DocCardList'; # Enrichment by mining -The OpenAIRE Research Graph is enriched using the different Text and Data Mining (TDM) algorithms that are grouped in the following categories. +**OpenAIRE** collects the full-texts of the publications, in order to apply TDM (Text and Data Mining) algorithms on them and enrich the Graph with inference links. -<DocCardList /> \ No newline at end of file +The collection of the full-texts is handled by the internal **PDF Aggregation Service**. 
This service uses the publications’ urls, from the OpenAIRE Graph and state-of-the-art algorithms, to crawl the web and try to locate and download the full-texts of the open access publications, while focusing on the most recent ones. It respects the servers of the repositories and publishers and avoids overloading them. + +The service is orchestrating a distributed execution system, on the cloud, with multiple microservices running in parallel, in order to efficiently process and download a large number of publications. The microservices store the generated report records for the publications, in a database, and the full-texts in an S3 Object Store. + +On the publication-page level, it applies text-mining algorithms to analyze the structure of the page, extract the full-text url and download the file. Additionally, it tracks various performance indicators to optimize the crawling speed, during execution. + +The PDF Aggregation Service is also capable of bulk-importing full-texts from compatible data sources, which increases the collection speed of full-texts. + +The different Text and Data Mining (TDM) algorithms used in the graph-enrichment process are grouped in the following categories. + +<DocCardList /> \ No newline at end of file From 1c668b7fd81073f4b52e27748d117a56fcc79161 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Mon, 3 Jul 2023 18:07:01 +0300 Subject: [PATCH 4/4] Eliminate the "ambiguous unicode character" warning on Gitea. 
--- .../enrichment-by-mining/enrichment-by-mining.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md b/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md index b30718d..1561699 100644 --- a/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md +++ b/docs/graph-production-workflow/enrichment-by-mining/enrichment-by-mining.md @@ -5,7 +5,7 @@ import DocCardList from '@theme/DocCardList'; **OpenAIRE** collects the full-texts of the publications, in order to apply TDM (Text and Data Mining) algorithms on them and enrich the Graph with inference links. -The collection of the full-texts is handled by the internal **PDF Aggregation Service**. This service uses the publications’ urls, from the OpenAIRE Graph and state-of-the-art algorithms, to crawl the web and try to locate and download the full-texts of the open access publications, while focusing on the most recent ones. It respects the servers of the repositories and publishers and avoids overloading them. +The collection of the full-texts is handled by the internal **PDF Aggregation Service**. This service uses the publications' urls, from the OpenAIRE Graph and state-of-the-art algorithms, to crawl the web and try to locate and download the full-texts of the open access publications, while focusing on the most recent ones. It respects the servers of the repositories and publishers and avoids overloading them. The service is orchestrating a distributed execution system, on the cloud, with multiple microservices running in parallel, in order to efficiently process and download a large number of publications. The microservices store the generated report records for the publications, in a database, and the full-texts in an S3 Object Store.