forked from D-Net/dnet-hadoop
Merge branch 'beta' into preserve_openorg_parent_child_relations
This commit is contained in:
commit
e5a2c596b2
|
@ -22,6 +22,10 @@
|
||||||
<id>dnet45-releases</id>
|
<id>dnet45-releases</id>
|
||||||
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
|
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
|
||||||
</repository>
|
</repository>
|
||||||
|
<site>
|
||||||
|
<id>DHPSite</id>
|
||||||
|
<url>file://${dhp.site.stage.path}/site/dhp-build/dhp-code-style</url>
|
||||||
|
</site>
|
||||||
</distributionManagement>
|
</distributionManagement>
|
||||||
|
|
||||||
<build>
|
<build>
|
||||||
|
@ -43,6 +47,7 @@
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
<dhp.site.stage.path>/tmp/dhp-site</dhp.site.stage.path>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
</project>
|
</project>
|
|
@ -10,6 +10,9 @@
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
|
|
||||||
<description>This module is a container for the build tools used in dnet-hadoop</description>
|
<description>This module is a container for the build tools used in dnet-hadoop</description>
|
||||||
|
<properties>
|
||||||
|
<maven.javadoc.skip>true</maven.javadoc.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
<modules>
|
<modules>
|
||||||
<module>dhp-code-style</module>
|
<module>dhp-code-style</module>
|
||||||
|
@ -17,4 +20,12 @@
|
||||||
<module>dhp-build-properties-maven-plugin</module>
|
<module>dhp-build-properties-maven-plugin</module>
|
||||||
</modules>
|
</modules>
|
||||||
|
|
||||||
|
|
||||||
|
<distributionManagement>
|
||||||
|
<site>
|
||||||
|
<id>DHPSite</id>
|
||||||
|
<url>file://${dhp.site.stage.path}/site/dhp-build</url>
|
||||||
|
</site>
|
||||||
|
</distributionManagement>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||||
|
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||||
|
name="DHP-Aggregation">
|
||||||
|
<skin>
|
||||||
|
<groupId>org.apache.maven.skins</groupId>
|
||||||
|
<artifactId>maven-fluido-skin</artifactId>
|
||||||
|
<version>1.8</version>
|
||||||
|
</skin>
|
||||||
|
<poweredBy>
|
||||||
|
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||||
|
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||||
|
</poweredBy>
|
||||||
|
<body>
|
||||||
|
<links>
|
||||||
|
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||||
|
</links>
|
||||||
|
|
||||||
|
<menu ref="modules" />
|
||||||
|
<menu ref="reports"/>
|
||||||
|
</body>
|
||||||
|
</project>
|
|
@ -13,6 +13,13 @@
|
||||||
<artifactId>dhp-common</artifactId>
|
<artifactId>dhp-common</artifactId>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
|
<distributionManagement>
|
||||||
|
<site>
|
||||||
|
<id>DHPSite</id>
|
||||||
|
<url>file://${dhp.site.stage.path}/site/dhp-common</url>
|
||||||
|
</site>
|
||||||
|
</distributionManagement>
|
||||||
|
|
||||||
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
|
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
|
|
@ -1,9 +1,20 @@
|
||||||
##DHP-Aggregation
|
##DHP-Aggregation
|
||||||
|
|
||||||
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
|
This module defines a set of oozie workflows for
|
||||||
|
|
||||||
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
|
1. the **collection** and **transformation** of metadata records.
|
||||||
|
2. the **integration** of new external information in the result
|
||||||
|
|
||||||
|
|
||||||
|
### Collection and Transformation
|
||||||
|
|
||||||
|
The workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
|
||||||
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
|
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
|
||||||
of each MDStore.
|
of each MDStore.
|
||||||
|
|
||||||
It defines [mappings](mappings.md) for transformation of different datasource (See mapping section).
|
It defines [mappings](mappings.md) for transformation of different datasource (See mapping section).
|
||||||
|
|
||||||
|
### Integration of external information in the result
|
||||||
|
|
||||||
|
The workflows create new entities in the OpenAIRE format (OAF) whose aim is to enrich the results already contained in the graph.
|
||||||
|
See the integration section for more details.
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
DHP Aggregation - Integration method
|
||||||
|
=====================================
|
||||||
|
|
||||||
|
The integration method can be applied every time new information, which is not aggregated from the repositories
|
||||||
|
nor computed directly by OpenAIRE, should be added to the results of the graph.
|
||||||
|
|
||||||
|
The information integrated so far is:
|
||||||
|
|
||||||
|
1. Article impact measures
|
||||||
|
1. [Bip!Finder](https://dl.acm.org/doi/10.1145/3357384.3357850) scores
|
||||||
|
2. Result Subjects
|
||||||
|
1. Integration of Fields of Science and Technology ([FOS](https://www.qnrf.org/en-us/FOS)) classification in
|
||||||
|
results subjects.
|
||||||
|
|
||||||
|
|
||||||
|
The method always consists in the creation of a new entity in the OpenAIRE format (OAF entity) containing only the id
|
||||||
|
and the element in the OAF model that should be used to map the information we want to integrate.
|
||||||
|
|
||||||
|
The id is set by using a particular encoding of the given PID
|
||||||
|
|
||||||
|
*unresolved:[pid]:[pidtype]*
|
||||||
|
|
||||||
|
where
|
||||||
|
|
||||||
|
1. *unresolved* is a constant value
|
||||||
|
2. *pid* is the persistent id value, e.g. 10.5281/zenodo.4707307
|
||||||
|
3. *pidtype* is the persistent id type, e.g. doi
|
||||||
|
|
||||||
|
Such entities are matched against those available in the graph using the result.instance.pid values.
|
||||||
|
|
||||||
|
This mechanism can be used to integrate enrichments that are associated with a given PID.
|
||||||
|
If a match is found with one of the results already in the graph, that result is enriched with the information
|
||||||
|
present in the new OAF.
|
||||||
|
All the objects for which a match is not found are discarded.
|
||||||
|
|
||||||
|
|
|
@ -4,13 +4,13 @@ This section describes the mapping implemented for [MEDLINE/PubMed](https://pubm
|
||||||
Collection
|
Collection
|
||||||
---------
|
---------
|
||||||
The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with
|
The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with
|
||||||
the following [shcema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
|
the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
|
||||||
|
|
||||||
|
|
||||||
Parsing
|
Parsing
|
||||||
-------
|
-------
|
||||||
The resposible class of parsing is [PMParser](./scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates
|
The class responsible for parsing is [PMParser](/dnet-hadoop/scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser), which generates
|
||||||
an intermediate mapping of PubMed Article defined [here](/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
|
an intermediate mapping of PubMed Article defined [here](/dnet-hadoop/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
|
||||||
|
|
||||||
|
|
||||||
Mapping
|
Mapping
|
||||||
|
@ -50,6 +50,10 @@ The table below describes the mapping from the XML Native to the OAF mapping
|
||||||
|//Author/FullName| author.Forename| Concatenation of forname + lastName if exist |
|
|//Author/FullName| author.Forename| Concatenation of forname + lastName if exist |
|
||||||
|FOR ALL AUTHOR | author.rank| sequential number starting from 1|
|
|FOR ALL AUTHOR | author.rank| sequential number starting from 1|
|
||||||
|
|
||||||
|
#TODO
|
||||||
|
|
||||||
|
Missing item mapped
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,9 @@
|
||||||
<item name="Pubmed" href="pubmed.html"/>
|
<item name="Pubmed" href="pubmed.html"/>
|
||||||
<item name="Datacite" href="datacite.html"/>
|
<item name="Datacite" href="datacite.html"/>
|
||||||
</item>
|
</item>
|
||||||
<item name="Release Notes" href="release-notes.html" />
|
<item name="Integration" href="integration.html" collapse="true">
|
||||||
|
|
||||||
|
</item>
|
||||||
<item name="General Information" href="about.html"/>
|
<item name="General Information" href="about.html"/>
|
||||||
|
|
||||||
<item name="JavaDoc" href="apidocs/" />
|
<item name="JavaDoc" href="apidocs/" />
|
||||||
|
|
|
@ -19,7 +19,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
class UpdateMatcherTest {
|
public class UpdateMatcherTest {
|
||||||
|
|
||||||
UpdateMatcher<String> matcher = new EnrichMissingPublicationDate();
|
UpdateMatcher<String> matcher = new EnrichMissingPublicationDate();
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||||
|
|
||||||
class EnrichMissingPublicationDateTest {
|
public class EnrichMissingPublicationDateTest {
|
||||||
|
|
||||||
final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate();
|
final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate();
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ import java.util.Arrays;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
class SubscriptionUtilsTest {
|
public class SubscriptionUtilsTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testVerifyListSimilar() {
|
void testVerifyListSimilar() {
|
||||||
|
|
|
@ -9,7 +9,7 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||||
|
|
||||||
class TrustUtilsTest {
|
public class TrustUtilsTest {
|
||||||
|
|
||||||
private static final double THRESHOLD = 0.95;
|
private static final double THRESHOLD = 0.95;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
#DHP Enrichment
|
|
@ -0,0 +1,26 @@
|
||||||
|
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||||
|
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||||
|
name="DHP-Aggregation">
|
||||||
|
<skin>
|
||||||
|
<groupId>org.apache.maven.skins</groupId>
|
||||||
|
<artifactId>maven-fluido-skin</artifactId>
|
||||||
|
<version>1.8</version>
|
||||||
|
</skin>
|
||||||
|
<poweredBy>
|
||||||
|
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||||
|
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||||
|
</poweredBy>
|
||||||
|
<body>
|
||||||
|
<links>
|
||||||
|
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||||
|
</links>
|
||||||
|
<menu name="Documentation">
|
||||||
|
<item name="Link1 Collapsable" href="about.html" collapse="true">
|
||||||
|
<item name="item1" href="pubmed.html"/>
|
||||||
|
<item name="item2" href="datacite.html"/>
|
||||||
|
</item>
|
||||||
|
</menu>
|
||||||
|
<menu ref="reports"/>
|
||||||
|
</body>
|
||||||
|
</project>
|
|
@ -15,6 +15,13 @@
|
||||||
|
|
||||||
<description>This module is the container for the oozie workflow definitions in dnet-hadoop project</description>
|
<description>This module is the container for the oozie workflow definitions in dnet-hadoop project</description>
|
||||||
|
|
||||||
|
<distributionManagement>
|
||||||
|
<site>
|
||||||
|
<id>DHPSite</id>
|
||||||
|
<url>file://${dhp.site.stage.path}/site/dhp-workflows</url>
|
||||||
|
</site>
|
||||||
|
</distributionManagement>
|
||||||
|
|
||||||
<modules>
|
<modules>
|
||||||
<module>dhp-workflow-profiles</module>
|
<module>dhp-workflow-profiles</module>
|
||||||
<module>dhp-aggregation</module>
|
<module>dhp-aggregation</module>
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||||
|
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||||
|
name="DHP-Aggregation">
|
||||||
|
<skin>
|
||||||
|
<groupId>org.apache.maven.skins</groupId>
|
||||||
|
<artifactId>maven-fluido-skin</artifactId>
|
||||||
|
<version>1.8</version>
|
||||||
|
</skin>
|
||||||
|
<poweredBy>
|
||||||
|
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||||
|
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||||
|
</poweredBy>
|
||||||
|
<body>
|
||||||
|
<links>
|
||||||
|
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||||
|
</links>
|
||||||
|
<menu name="APIDocs">
|
||||||
|
<item name="JavaDoc" href="apidocs/" />
|
||||||
|
<item name="ScalaDoc" href="scaladocs/" />
|
||||||
|
</menu>
|
||||||
|
<menu ref="modules" />
|
||||||
|
<menu ref="reports"/>
|
||||||
|
</body>
|
||||||
|
</project>
|
5
pom.xml
5
pom.xml
|
@ -719,6 +719,10 @@
|
||||||
<id>dnet45-releases</id>
|
<id>dnet45-releases</id>
|
||||||
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
|
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
|
||||||
</repository>
|
</repository>
|
||||||
|
<site>
|
||||||
|
<id>DHPSite</id>
|
||||||
|
<url>file://${dhp.site.stage.path}/site/</url>
|
||||||
|
</site>
|
||||||
</distributionManagement>
|
</distributionManagement>
|
||||||
<reporting>
|
<reporting>
|
||||||
<plugins>
|
<plugins>
|
||||||
|
@ -734,6 +738,7 @@
|
||||||
</reporting>
|
</reporting>
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
|
<dhp.site.stage.path>/tmp/dhp-site</dhp.site.stage.path>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||||
<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
|
<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||||
|
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||||
|
name="DHP-Aggregation">
|
||||||
|
<skin>
|
||||||
|
<groupId>org.apache.maven.skins</groupId>
|
||||||
|
<artifactId>maven-fluido-skin</artifactId>
|
||||||
|
<version>1.8</version>
|
||||||
|
</skin>
|
||||||
|
<poweredBy>
|
||||||
|
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||||
|
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||||
|
</poweredBy>
|
||||||
|
<body>
|
||||||
|
<links>
|
||||||
|
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||||
|
</links>
|
||||||
|
<menu ref="modules" />
|
||||||
|
<menu ref="reports"/>
|
||||||
|
</body>
|
||||||
|
</project>
|
Loading…
Reference in New Issue