enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
7 changed files with 61 additions and 20 deletions
Showing only changes of commit 3ebf81d2b0 - Show all commits

View File

@ -63,6 +63,8 @@ public abstract class AbstractMdRecordToOafMapper {
protected final VocabularyGroup vocs; protected final VocabularyGroup vocs;
private final boolean invisible;
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
protected static final Qualifier ORCID_PID_TYPE = qualifier( protected static final Qualifier ORCID_PID_TYPE = qualifier(
@ -85,8 +87,9 @@ public abstract class AbstractMdRecordToOafMapper {
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs) { protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
this.vocs = vocs; this.vocs = vocs;
this.invisible = invisible;
} }
public List<Oaf> processMdRecord(final String xml) { public List<Oaf> processMdRecord(final String xml) {
@ -112,7 +115,7 @@ public abstract class AbstractMdRecordToOafMapper {
return null; return null;
} }
final DataInfo info = prepareDataInfo(doc); final DataInfo info = prepareDataInfo(doc, invisible);
final long lastUpdateTimestamp = new Date().getTime(); final long lastUpdateTimestamp = new Date().getTime();
return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
@ -510,11 +513,11 @@ public abstract class AbstractMdRecordToOafMapper {
return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
} }
protected DataInfo prepareDataInfo(final Document doc) { protected DataInfo prepareDataInfo(final Document doc, final boolean invisible) {
final Node n = doc.selectSingleNode("//oaf:datainfo"); final Node n = doc.selectSingleNode("//oaf:datainfo");
if (n == null) { if (n == null) {
return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); return dataInfo(false, null, false, invisible, REPOSITORY_PROVENANCE_ACTIONS, "0.9");
} }
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
@ -528,7 +531,7 @@ public abstract class AbstractMdRecordToOafMapper {
final String trust = n.valueOf("./oaf:trust"); final String trust = n.valueOf("./oaf:trust");
return dataInfo( return dataInfo(
deletedbyinference, inferenceprovenance, inferred, false, deletedbyinference, inferenceprovenance, inferred, invisible,
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
} }

View File

@ -137,10 +137,14 @@ public class GenerateEntitiesApplication {
final String type = StringUtils.substringAfter(id, ":"); final String type = StringUtils.substringAfter(id, ":");
switch (type.toLowerCase()) { switch (type.toLowerCase()) {
case "native_oaf": case "oaf-store-cleaned":
return new OafToOafMapper(vocs).processMdRecord(s); return new OafToOafMapper(vocs, false).processMdRecord(s);
case "native_odf": case "odf-store-cleaned":
return new OdfToOafMapper(vocs).processMdRecord(s); return new OdfToOafMapper(vocs, false).processMdRecord(s);
case "oaf-store-intersection":
return new OafToOafMapper(vocs, true).processMdRecord(s);
case "odf-store-intersection":
return new OdfToOafMapper(vocs, true).processMdRecord(s);
case "datasource": case "datasource":
return Arrays.asList(convertFromJson(s, Datasource.class)); return Arrays.asList(convertFromJson(s, Datasource.class));
case "organization": case "organization":

View File

@ -26,8 +26,7 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrationApplicatio
IOUtils IOUtils
.toString( .toString(
MigrateMongoMdstoresApplication.class MigrateMongoMdstoresApplication.class
.getResourceAsStream( .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json")));
"/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json")));
parser.parseArgument(args); parser.parseArgument(args);
final String mongoBaseUrl = parser.get("mongoBaseUrl"); final String mongoBaseUrl = parser.get("mongoBaseUrl");
@ -60,7 +59,7 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrationApplicatio
final String currentColl = entry.getValue(); final String currentColl = entry.getValue();
for (final String xml : mdstoreClient.listRecords(currentColl)) { for (final String xml : mdstoreClient.listRecords(currentColl)) {
emit(xml, "native_" + format); emit(xml, String.format("%s-%s-%s", format, layout, interpretation));
} }
} }
} }

View File

@ -37,8 +37,8 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OafToOafMapper extends AbstractMdRecordToOafMapper { public class OafToOafMapper extends AbstractMdRecordToOafMapper {
public OafToOafMapper(final VocabularyGroup vocs) { public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
super(vocs); super(vocs, invisible);
} }
@Override @Override

View File

@ -44,8 +44,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/"; public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
public OdfToOafMapper(final VocabularyGroup vocs) { public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
super(vocs); super(vocs, invisible);
} }
@Override @Override

View File

@ -210,6 +210,23 @@
<arg>--mdLayout</arg><arg>store</arg> <arg>--mdLayout</arg><arg>store</arg>
<arg>--mdInterpretation</arg><arg>cleaned</arg> <arg>--mdInterpretation</arg><arg>cleaned</arg>
</java> </java>
<ok to="ImportOAF_invisible"/>
<error to="Kill"/>
</action>
<action name="ImportOAF_invisible">
<java>
<prepare>
<delete path="${contentPath}/oaf_records_invisible"/>
</prepare>
<main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
<arg>--hdfsPath</arg><arg>${contentPath}/oaf_records_invisible</arg>
<arg>--mongoBaseUrl</arg><arg>${mongoURL}</arg>
<arg>--mongoDb</arg><arg>${mongoDb}</arg>
<arg>--mdFormat</arg><arg>OAF</arg>
<arg>--mdLayout</arg><arg>store</arg>
<arg>--mdInterpretation</arg><arg>intersection</arg>
</java>
<ok to="wait_import"/> <ok to="wait_import"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -237,7 +254,7 @@
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg> <arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims,${contentPath}/oaf_records_invisible</arg>
<arg>--targetPath</arg><arg>${workingDir}/entities_claim</arg> <arg>--targetPath</arg><arg>${workingDir}/entities_claim</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark> </spark>

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.ArgumentMatchers.anyString;
@ -55,7 +56,7 @@ public class MappersTest {
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
final List<Oaf> list = new OafToOafMapper(vocs).processMdRecord(xml); final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml);
assertEquals(3, list.size()); assertEquals(3, list.size());
assertTrue(list.get(0) instanceof Publication); assertTrue(list.get(0) instanceof Publication);
@ -69,6 +70,7 @@ public class MappersTest {
assertValidId(p.getId()); assertValidId(p.getId());
assertValidId(p.getCollectedfrom().get(0).getKey()); assertValidId(p.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
assertFalse(p.getDataInfo().getInvisible());
assertTrue(p.getAuthor().size() > 0); assertTrue(p.getAuthor().size() > 0);
final Optional<Author> author = p final Optional<Author> author = p
@ -134,11 +136,27 @@ public class MappersTest {
// System.out.println(new ObjectMapper().writeValueAsString(r2)); // System.out.println(new ObjectMapper().writeValueAsString(r2));
} }
/**
 * Maps the sample record {@code oaf_record.xml} with an {@link OafToOafMapper} whose
 * {@code invisible} constructor argument is {@code true}, and verifies that the first
 * mapped entity is a {@link Publication} whose data info reports it as invisible.
 *
 * @throws IOException if the sample record cannot be read
 */
@Test
void testPublicationInvisible() throws IOException {
	final String record = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));

	// second constructor argument is the invisible flag under test
	final OafToOafMapper invisibleMapper = new OafToOafMapper(vocs, true);
	final List<Oaf> mapped = invisibleMapper.processMdRecord(record);

	// at least one entity must be produced, and the first one must be the publication
	assertTrue(!mapped.isEmpty());
	final Oaf first = mapped.get(0);
	assertTrue(first instanceof Publication);

	final Publication publication = (Publication) first;
	assertTrue(publication.getDataInfo().getInvisible());
}
@Test @Test
void testDataset() throws IOException { void testDataset() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
assertEquals(3, list.size()); assertEquals(3, list.size());
assertTrue(list.get(0) instanceof Dataset); assertTrue(list.get(0) instanceof Dataset);
@ -220,7 +238,7 @@ public class MappersTest {
void testSoftware() throws IOException { void testSoftware() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml"));
final List<Oaf> list = new OdfToOafMapper(vocs).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
assertEquals(1, list.size()); assertEquals(1, list.size());
assertTrue(list.get(0) instanceof Software); assertTrue(list.get(0) instanceof Software);