forked from D-Net/dnet-hadoop
Merge branch 'beta' into preserve_openorg_parent_child_relations
This commit is contained in:
commit
e5a2c596b2
|
@ -22,6 +22,10 @@
|
|||
<id>dnet45-releases</id>
|
||||
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
|
||||
</repository>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>file://${dhp.site.stage.path}/site/dhp-build/dhp-code-style</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
<build>
|
||||
|
@ -43,6 +47,7 @@
|
|||
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<dhp.site.stage.path>/tmp/dhp-site</dhp.site.stage.path>
|
||||
</properties>
|
||||
|
||||
</project>
|
|
@ -10,6 +10,9 @@
|
|||
<packaging>pom</packaging>
|
||||
|
||||
<description>This module is a container for the build tools used in dnet-hadoop</description>
|
||||
<properties>
|
||||
<maven.javadoc.skip>true</maven.javadoc.skip>
|
||||
</properties>
|
||||
|
||||
<modules>
|
||||
<module>dhp-code-style</module>
|
||||
|
@ -17,4 +20,12 @@
|
|||
<module>dhp-build-properties-maven-plugin</module>
|
||||
</modules>
|
||||
|
||||
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>file://${dhp.site.stage.path}/site/dhp-build</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||
name="DHP-Aggregation">
|
||||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-fluido-skin</artifactId>
|
||||
<version>1.8</version>
|
||||
</skin>
|
||||
<poweredBy>
|
||||
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||
</poweredBy>
|
||||
<body>
|
||||
<links>
|
||||
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||
</links>
|
||||
|
||||
<menu ref="modules" />
|
||||
<menu ref="reports"/>
|
||||
</body>
|
||||
</project>
|
|
@ -13,6 +13,13 @@
|
|||
<artifactId>dhp-common</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>file://${dhp.site.stage.path}/site/dhp-common</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
|
||||
|
||||
<dependencies>
|
||||
|
|
|
@ -1,9 +1,20 @@
|
|||
## DHP-Aggregation
|
||||
|
||||
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
|
||||
This module defines a set of oozie workflows for
|
||||
|
||||
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
|
||||
1. the **collection** and **transformation** of metadata records.
|
||||
2. the **integration** of new external information in the result
|
||||
|
||||
|
||||
### Collection and Transformation
|
||||
|
||||
The workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
|
||||
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
|
||||
of each MDStore.
|
||||
|
||||
It defines [mappings](mappings.md) for transformation of different datasource (See mapping section).
|
||||
It defines [mappings](mappings.md) for transformation of different datasource (See mapping section).
|
||||
|
||||
### Integration of external information in the result
|
||||
|
||||
The workflows create new entities in the OpenAIRE format (OAF) whose aim is to enrich the results already contained in the graph.
|
||||
See integration section for more insight
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
DHP Aggregation - Integration method
|
||||
=====================================
|
||||
|
||||
The integration method can be applied every time new information, which is not aggregated from the repositories
|
||||
nor computed directly by OpenAIRE, should be added to the results of the graph.
|
||||
|
||||
The information integrated so far is:
|
||||
|
||||
1. Article impact measures
|
||||
1. [Bip!Finder](https://dl.acm.org/doi/10.1145/3357384.3357850) scores
|
||||
2. Result Subjects
|
||||
1. Integration of Fields of Science and Technology ([FOS](https://www.qnrf.org/en-us/FOS)) classification in
|
||||
results subjects.
|
||||
|
||||
|
||||
The method always consists in the creation of a new entity in the OpenAIRE format (OAF entity) containing only the id
|
||||
and the element in the OAF model that should be used to map the information we want to integrate.
|
||||
|
||||
The id is set by using a particular encoding of the given PID
|
||||
|
||||
*unresolved:[pid]:[pidtype]*
|
||||
|
||||
where
|
||||
|
||||
1. *unresolved* is a constant value
|
||||
2. *pid* is the persistent id value, e.g. 10.5281/zenodo.4707307
|
||||
3. *pidtype* is the persistent id type, e.g. doi
|
||||
|
||||
Such entities are matched against those available in the graph using the result.instance.pid values.
|
||||
|
||||
This mechanism can be used to integrate enrichments that are associated with a given PID.
|
||||
If a match is found with one of the results already in the graph, that result will be enriched with the information
|
||||
present in the new OAF.
|
||||
All the objects for which a match is not found are discarded.
|
||||
|
||||
|
|
@ -4,13 +4,13 @@ This section describes the mapping implemented for [MEDLINE/PubMed](https://pubm
|
|||
Collection
|
||||
---------
|
||||
The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with
|
||||
the following [shcema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
|
||||
the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
|
||||
|
||||
|
||||
Parsing
|
||||
-------
|
||||
The resposible class of parsing is [PMParser](./scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates
|
||||
an intermediate mapping of PubMed Article defined [here](/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
|
||||
The class responsible for parsing is [PMParser](/dnet-hadoop/scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates
|
||||
an intermediate mapping of PubMed Article defined [here](/dnet-hadoop/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
|
||||
|
||||
|
||||
Mapping
|
||||
|
@ -50,6 +50,10 @@ The table below describes the mapping from the XML Native to the OAF mapping
|
|||
|//Author/FullName| author.Forename| Concatenation of forename + lastName if they exist |
|
||||
|FOR ALL AUTHOR | author.rank| sequential number starting from 1|
|
||||
|
||||
#TODO
|
||||
|
||||
Missing item mapped
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -20,7 +20,9 @@
|
|||
<item name="Pubmed" href="pubmed.html"/>
|
||||
<item name="Datacite" href="datacite.html"/>
|
||||
</item>
|
||||
<item name="Release Notes" href="release-notes.html" />
|
||||
<item name="Integration" href="integration.html" collapse="true">
|
||||
|
||||
</item>
|
||||
<item name="General Information" href="about.html"/>
|
||||
|
||||
<item name="JavaDoc" href="apidocs/" />
|
||||
|
|
|
@ -19,7 +19,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
|||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
class UpdateMatcherTest {
|
||||
public class UpdateMatcherTest {
|
||||
|
||||
UpdateMatcher<String> matcher = new EnrichMissingPublicationDate();
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test;
|
|||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
|
||||
class EnrichMissingPublicationDateTest {
|
||||
public class EnrichMissingPublicationDateTest {
|
||||
|
||||
final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate();
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ import java.util.Arrays;
|
|||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class SubscriptionUtilsTest {
|
||||
public class SubscriptionUtilsTest {
|
||||
|
||||
@Test
|
||||
void testVerifyListSimilar() {
|
||||
|
|
|
@ -9,7 +9,7 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor;
|
|||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
|
||||
class TrustUtilsTest {
|
||||
public class TrustUtilsTest {
|
||||
|
||||
private static final double THRESHOLD = 0.95;
|
||||
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
# DHP Enrichment
|
|
@ -0,0 +1,26 @@
|
|||
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||
name="DHP-Aggregation">
|
||||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-fluido-skin</artifactId>
|
||||
<version>1.8</version>
|
||||
</skin>
|
||||
<poweredBy>
|
||||
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||
</poweredBy>
|
||||
<body>
|
||||
<links>
|
||||
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||
</links>
|
||||
<menu name="Documentation">
|
||||
<item name="Link1 Collapsable" href="about.html" collapse="true">
|
||||
<item name="item1" href="pubmed.html"/>
|
||||
<item name="item2" href="datacite.html"/>
|
||||
</item>
|
||||
</menu>
|
||||
<menu ref="reports"/>
|
||||
</body>
|
||||
</project>
|
|
@ -15,6 +15,13 @@
|
|||
|
||||
<description>This module is the container for the oozie workflow definitions in dnet-hadoop project</description>
|
||||
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>file://${dhp.site.stage.path}/site/dhp-workflows</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
<modules>
|
||||
<module>dhp-workflow-profiles</module>
|
||||
<module>dhp-aggregation</module>
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||
name="DHP-Aggregation">
|
||||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-fluido-skin</artifactId>
|
||||
<version>1.8</version>
|
||||
</skin>
|
||||
<poweredBy>
|
||||
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||
</poweredBy>
|
||||
<body>
|
||||
<links>
|
||||
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||
</links>
|
||||
<menu name="APIDocs">
|
||||
<item name="JavaDoc" href="apidocs/" />
|
||||
<item name="ScalaDoc" href="scaladocs/" />
|
||||
</menu>
|
||||
<menu ref="modules" />
|
||||
<menu ref="reports"/>
|
||||
</body>
|
||||
</project>
|
5
pom.xml
5
pom.xml
|
@ -719,6 +719,10 @@
|
|||
<id>dnet45-releases</id>
|
||||
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
|
||||
</repository>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>file://${dhp.site.stage.path}/site/</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
<reporting>
|
||||
<plugins>
|
||||
|
@ -734,6 +738,7 @@
|
|||
</reporting>
|
||||
|
||||
<properties>
|
||||
<dhp.site.stage.path>/tmp/dhp-site</dhp.site.stage.path>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||
<maven.compiler.plugin.version>3.6.0</maven.compiler.plugin.version>
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||
name="DHP-Aggregation">
|
||||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-fluido-skin</artifactId>
|
||||
<version>1.8</version>
|
||||
</skin>
|
||||
<poweredBy>
|
||||
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||
</poweredBy>
|
||||
<body>
|
||||
<links>
|
||||
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||
</links>
|
||||
<menu ref="modules" />
|
||||
<menu ref="reports"/>
|
||||
</body>
|
||||
</project>
|
Loading…
Reference in New Issue