towards EOSC datasource profiles #130

Merged
alessia.bardi merged 7 commits from datasource_model_eosc_beta into beta 2021-08-23 11:58:35 +02:00
16 changed files with 770 additions and 728 deletions

View File

@ -1,17 +1,7 @@
package eu.dnetlib.dhp.actionmanager.project; package eu.dnetlib.dhp.actionmanager.project;
import com.fasterxml.jackson.databind.ObjectMapper; import static org.junit.jupiter.api.Assertions.assertTrue;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.GetCSV;
import eu.dnetlib.dhp.common.collection.HttpConnector2;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.*;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
@ -19,7 +9,20 @@ import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.nio.file.Files; import java.nio.file.Files;
import static org.junit.jupiter.api.Assertions.assertTrue; import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.GetCSV;
import eu.dnetlib.dhp.common.collection.HttpConnector2;
public class DownloadCsvTest { public class DownloadCsvTest {
@ -103,8 +106,8 @@ public class DownloadCsvTest {
GetCSV GetCSV
.getCsv( .getCsv(
fs, fs,
new BufferedReader(new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL))) new BufferedReader(new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL))),
, workingDir + "/projects", workingDir + "/projects",
CSVProject.class.getName(), ';'); CSVProject.class.getName(), ';');
BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(workingDir + "/projects")))); BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(workingDir + "/projects"))));

View File

@ -33,7 +33,7 @@ public class Process implements Serializable {
ri.setType(Constants.RESEARCH_INFRASTRUCTURE); ri.setType(Constants.RESEARCH_INFRASTRUCTURE);
} }
ri.setId(Utils.getContextId(ci.getId())); ri.setId(Utils.getContextId(ci.getId()));
ri.setOriginalId(ci.getId()); ri.setAcronym(ci.getId());
ri.setDescription(ci.getDescription()); ri.setDescription(ci.getDescription());
ri.setName(ci.getName()); ri.setName(ci.getName());

View File

@ -1,15 +1,51 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_MERGED_IN;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.MERGES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORG_ORG_RELTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.asString;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.sql.Array; import java.sql.Array;
import java.sql.ResultSet; import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.*; import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.function.Consumer; import java.util.function.Consumer;
import java.util.function.Function; import java.util.function.Function;
import java.util.function.Predicate; import java.util.function.Predicate;
@ -50,8 +86,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
private static final Logger log = LoggerFactory.getLogger(MigrateDbEntitiesApplication.class); private static final Logger log = LoggerFactory.getLogger(MigrateDbEntitiesApplication.class);
private static final DataInfo DATA_INFO_CLAIM = dataInfo( private static final DataInfo DATA_INFO_CLAIM = dataInfo(
false, null, false, false, false, null, false, false, qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS),
qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9"); "0.9");
private static final List<KeyValue> COLLECTED_FROM_CLAIM = listKeyValues( private static final List<KeyValue> COLLECTED_FROM_CLAIM = listKeyValues(
createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE"); createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE");
@ -69,10 +105,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils IOUtils
.toString( .toString(
Objects
.requireNonNull(
MigrateDbEntitiesApplication.class MigrateDbEntitiesApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json")))); .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json")));
parser.parseArgument(args); parser.parseArgument(args);
@ -86,7 +120,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
log.info("postgresPassword: xxx"); log.info("postgresPassword: xxx");
final String dbSchema = parser.get("dbschema"); final String dbSchema = parser.get("dbschema");
log.info("dbSchema {}: ", dbSchema); log.info("dbSchema {}: " + dbSchema);
final String isLookupUrl = parser.get("isLookupUrl"); final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl); log.info("isLookupUrl: {}", isLookupUrl);
@ -139,8 +173,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
case openorgs_dedup: // generates organization entities and relations for openorgs dedup case openorgs_dedup: // generates organization entities and relations for openorgs dedup
log.info("Processing Openorgs..."); log.info("Processing Openorgs...");
smdbe smdbe
.execute( .execute("queryOpenOrgsForOrgsDedup.sql", smdbe::processOrganization, verifyNamespacePrefix);
"queryOpenOrgsForOrgsDedup.sql", smdbe::processOrganization, verifyNamespacePrefix);
log.info("Processing Openorgs Sim Rels..."); log.info("Processing Openorgs Sim Rels...");
smdbe.execute("queryOpenOrgsSimilarityForOrgsDedup.sql", smdbe::processOrgOrgSimRels); smdbe.execute("queryOpenOrgsSimilarityForOrgsDedup.sql", smdbe::processOrgOrgSimRels);
@ -149,8 +182,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
case openorgs: // generates organization entities and relations for provision case openorgs: // generates organization entities and relations for provision
log.info("Processing Openorgs For Provision..."); log.info("Processing Openorgs For Provision...");
smdbe smdbe
.execute( .execute("queryOpenOrgsForProvision.sql", smdbe::processOrganization, verifyNamespacePrefix);
"queryOpenOrgsForProvision.sql", smdbe::processOrganization, verifyNamespacePrefix);
log.info("Processing Openorgs Merge Rels..."); log.info("Processing Openorgs Merge Rels...");
smdbe.execute("queryOpenOrgsSimilarityForProvision.sql", smdbe::processOrgOrgMergeRels); smdbe.execute("queryOpenOrgsSimilarityForProvision.sql", smdbe::processOrgOrgMergeRels);
@ -228,6 +260,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB
ds.setOaiprovenance(null); // Values not present in the DB ds.setOaiprovenance(null); // Values not present in the DB
ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype")));
ds.setDatasourcetypeui(prepareQualifierSplitting(rs.getString("datasourcetypeui")));
ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility"))); ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility")));
ds.setOfficialname(field(rs.getString("officialname"), info)); ds.setOfficialname(field(rs.getString("officialname"), info));
ds.setEnglishname(field(rs.getString("englishname"), info)); ds.setEnglishname(field(rs.getString("englishname"), info));
@ -269,6 +302,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
ds.setDataInfo(info); ds.setDataInfo(info);
ds.setLastupdatetimestamp(lastUpdateTimestamp); ds.setLastupdatetimestamp(lastUpdateTimestamp);
ds.setJurisdiction(prepareQualifierSplitting(rs.getString("jurisdiction")));
ds.setThematic(rs.getBoolean("thematic"));
ds.setKnowledgegraph(rs.getBoolean("knowledgegraph"));
ds.setContentpolicies(prepareListOfQualifiers(rs.getArray("contentpolicies")));
return Arrays.asList(ds); return Arrays.asList(ds);
} catch (final Exception e) { } catch (final Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
@ -494,8 +532,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
throw new IllegalStateException( throw new IllegalStateException(
String String
.format( .format(
"invalid claim, sourceId: %s, targetId: %s, semantics: %s", "invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
sourceId, targetId, semantics)); semantics));
} }
r1 = setRelationSemantic(r1, RESULT_PROJECT, OUTCOME, PRODUCES); r1 = setRelationSemantic(r1, RESULT_PROJECT, OUTCOME, PRODUCES);
r2 = setRelationSemantic(r2, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY); r2 = setRelationSemantic(r2, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY);
@ -515,8 +553,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
} }
} }
private Relation prepareRelation(String sourceId, String targetId, String validationDate) { private Relation prepareRelation(final String sourceId, final String targetId, final String validationDate) {
Relation r = new Relation(); final Relation r = new Relation();
if (StringUtils.isNotBlank(validationDate)) { if (StringUtils.isNotBlank(validationDate)) {
r.setValidated(true); r.setValidated(true);
r.setValidationDate(validationDate); r.setValidationDate(validationDate);
@ -529,7 +567,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
return r; return r;
} }
private Relation setRelationSemantic(Relation r, String relType, String subRelType, String relClass) { private Relation setRelationSemantic(final Relation r, final String relType, final String subRelType,
final String relClass) {
r.setRelType(relType); r.setRelType(relType);
r.setSubRelType(subRelType); r.setSubRelType(subRelType);
r.setRelClass(relClass); r.setRelClass(relClass);
@ -602,6 +641,19 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
return res; return res;
} }
private List<Qualifier> prepareListOfQualifiers(final Array array) throws SQLException {
final List<Qualifier> res = new ArrayList<>();
if (array != null) {
for (final String s : (String[]) array.getArray()) {
final Qualifier q = prepareQualifierSplitting(s);
if (q != null) {
res.add(q);
}
}
}
return res;
}
public List<Oaf> processOrgOrgMergeRels(final ResultSet rs) { public List<Oaf> processOrgOrgMergeRels(final ResultSet rs) {
try { try {
final DataInfo info = prepareDataInfo(rs); // TODO final DataInfo info = prepareDataInfo(rs); // TODO
@ -658,6 +710,18 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
r1.setDataInfo(info); r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp); r1.setLastupdatetimestamp(lastUpdateTimestamp);
// removed because there's no difference between two sides //TODO
// final Relation r2 = new Relation();
// r2.setRelType(ORG_ORG_RELTYPE);
// r2.setSubRelType(ORG_ORG_SUBRELTYPE);
// r2.setRelClass(relClass);
// r2.setSource(orgId2);
// r2.setTarget(orgId1);
// r2.setCollectedfrom(collectedFrom);
// r2.setDataInfo(info);
// r2.setLastupdatetimestamp(lastUpdateTimestamp);
// return Arrays.asList(r1, r2);
return Arrays.asList(r1); return Arrays.asList(r1);
} catch (final Exception e) { } catch (final Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);

View File

@ -84,13 +84,18 @@ SELECT
dc.id AS collectedfromid, dc.id AS collectedfromid,
dc.officialname AS collectedfromname, dc.officialname AS collectedfromname,
d.typology||'@@@dnet:datasource_typologies' AS datasourcetype, d.typology||'@@@dnet:datasource_typologies' AS datasourcetype,
d.typology||'@@@dnet:datasource_typologies_ui' AS datasourcetypeui,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction, 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
d.issn AS issnPrinted, d.issn AS issnPrinted,
d.eissn AS issnOnline, d.eissn AS issnOnline,
d.lissn AS issnLinking d.lissn AS issnLinking,
de.jurisdiction||'@@@eosc:jurisdictions' AS jurisdiction,
de.thematic AS thematic,
de.knowledge_graph AS knowledgegraph,
array(select unnest(de.content_policies)||'@@@eosc:contentpolicies') AS contentpolicies
FROM dsm_datasources d FROM dsm_datasources d
LEFT OUTER JOIN dsm_datasources_eosc de on (d.id = de.id)
LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id) LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id)
LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource) LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource)
LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource) LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource)
@ -126,4 +131,8 @@ GROUP BY
dc.officialname, dc.officialname,
d.issn, d.issn,
d.eissn, d.eissn,
d.lissn d.lissn,
de.jurisdiction,
de.thematic,
de.knowledge_graph,
de.content_policies

View File

@ -11,7 +11,6 @@ import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import com.fasterxml.jackson.core.JsonProcessingException;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
@ -21,6 +20,7 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock; import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
@ -760,7 +760,6 @@ class MappersTest {
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid()); assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
} }
@Test @Test
void testXMLEncodedURL() throws IOException, DocumentException { void testXMLEncodedURL() throws IOException, DocumentException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url.xml"))); final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url.xml")));
@ -779,7 +778,8 @@ class MappersTest {
@Test @Test
void testXMLEncodedURL_ODF() throws IOException, DocumentException { void testXMLEncodedURL_ODF() throws IOException, DocumentException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url_odf.xml"))); final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url_odf.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************"); System.out.println("***************");

View File

@ -28,11 +28,16 @@ import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
class MigrateDbEntitiesApplicationTest { public class MigrateDbEntitiesApplicationTest {
private MigrateDbEntitiesApplication app; private MigrateDbEntitiesApplication app;
@ -46,11 +51,8 @@ class MigrateDbEntitiesApplicationTest {
public void setUp() { public void setUp() {
lenient() lenient()
.when(vocs.getTermAsQualifier(anyString(), anyString())) .when(vocs.getTermAsQualifier(anyString(), anyString()))
.thenAnswer( .thenAnswer(invocation -> OafMapperUtils
invocation -> OafMapperUtils .qualifier(invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0), invocation.getArgument(0)));
.qualifier(
invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0),
invocation.getArgument(0)));
lenient().when(vocs.termExists(anyString(), anyString())).thenReturn(true); lenient().when(vocs.termExists(anyString(), anyString())).thenReturn(true);
@ -58,7 +60,7 @@ class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
void testProcessDatasource() throws Exception { public void testProcessDatasource() throws Exception {
final List<TypedField> fields = prepareMocks("datasources_resultset_entry.json"); final List<TypedField> fields = prepareMocks("datasources_resultset_entry.json");
final List<Oaf> list = app.processDatasource(rs); final List<Oaf> list = app.processDatasource(rs);
@ -78,10 +80,27 @@ class MigrateDbEntitiesApplicationTest {
assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted()); assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted());
assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline()); assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline());
assertEquals(getValueAsString("issnLinking", fields), ds.getJournal().getIssnLinking()); assertEquals(getValueAsString("issnLinking", fields), ds.getJournal().getIssnLinking());
assertEquals("pubsrepository::journal", ds.getDatasourcetype().getClassid());
assertEquals("dnet:datasource_typologies", ds.getDatasourcetype().getSchemeid());
assertEquals("pubsrepository::journal", ds.getDatasourcetypeui().getClassid());
assertEquals("dnet:datasource_typologies_ui", ds.getDatasourcetypeui().getSchemeid());
assertEquals("National", ds.getJurisdiction().getClassid());
assertEquals("eosc:jurisdictions", ds.getJurisdiction().getSchemeid());
assertTrue(ds.getThematic());
assertTrue(ds.getKnowledgegraph());
assertEquals(1, ds.getContentpolicies().size());
assertEquals("Journal article", ds.getContentpolicies().get(0).getClassid());
assertEquals("eosc:contentpolicies", ds.getContentpolicies().get(0).getSchemeid());
} }
@Test @Test
void testProcessProject() throws Exception { public void testProcessProject() throws Exception {
final List<TypedField> fields = prepareMocks("projects_resultset_entry.json"); final List<TypedField> fields = prepareMocks("projects_resultset_entry.json");
final List<Oaf> list = app.processProject(rs); final List<Oaf> list = app.processProject(rs);
@ -99,7 +118,7 @@ class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
void testProcessOrganization() throws Exception { public void testProcessOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("organizations_resultset_entry.json"); final List<TypedField> fields = prepareMocks("organizations_resultset_entry.json");
final List<Oaf> list = app.processOrganization(rs); final List<Oaf> list = app.processOrganization(rs);
@ -119,14 +138,14 @@ class MigrateDbEntitiesApplicationTest {
assertEquals(getValueAsString("country", fields).split("@@@")[1], o.getCountry().getSchemeid()); assertEquals(getValueAsString("country", fields).split("@@@")[1], o.getCountry().getSchemeid());
assertEquals(getValueAsString("country", fields).split("@@@")[1], o.getCountry().getSchemename()); assertEquals(getValueAsString("country", fields).split("@@@")[1], o.getCountry().getSchemename());
assertEquals(getValueAsString("collectedfromname", fields), o.getCollectedfrom().get(0).getValue()); assertEquals(getValueAsString("collectedfromname", fields), o.getCollectedfrom().get(0).getValue());
List<String> alternativenames = getValueAsList("alternativenames", fields); final List<String> alternativenames = getValueAsList("alternativenames", fields);
assertEquals(2, alternativenames.size()); assertEquals(2, alternativenames.size());
assertTrue(alternativenames.contains("Pippo")); assertTrue(alternativenames.contains("Pippo"));
assertTrue(alternativenames.contains("Foo")); assertTrue(alternativenames.contains("Foo"));
} }
@Test @Test
void testProcessDatasourceOrganization() throws Exception { public void testProcessDatasourceOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json"); final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json");
final List<Oaf> list = app.processDatasourceOrganization(rs); final List<Oaf> list = app.processDatasourceOrganization(rs);
@ -143,7 +162,7 @@ class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
void testProcessProjectOrganization() throws Exception { public void testProcessProjectOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("projectorganization_resultset_entry.json"); final List<TypedField> fields = prepareMocks("projectorganization_resultset_entry.json");
final List<Oaf> list = app.processProjectOrganization(rs); final List<Oaf> list = app.processProjectOrganization(rs);
@ -162,7 +181,7 @@ class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
void testProcessClaims_context() throws Exception { public void testProcessClaims_context() throws Exception {
final List<TypedField> fields = prepareMocks("claimscontext_resultset_entry.json"); final List<TypedField> fields = prepareMocks("claimscontext_resultset_entry.json");
final List<Oaf> list = app.processClaims(rs); final List<Oaf> list = app.processClaims(rs);
@ -177,7 +196,7 @@ class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
void testProcessClaims_rels() throws Exception { public void testProcessClaims_rels() throws Exception {
final List<TypedField> fields = prepareMocks("claimsrel_resultset_entry.json"); final List<TypedField> fields = prepareMocks("claimsrel_resultset_entry.json");
final List<Oaf> list = app.processClaims(rs); final List<Oaf> list = app.processClaims(rs);
@ -208,13 +227,15 @@ class MigrateDbEntitiesApplicationTest {
assertValidId(r1.getCollectedfrom().get(0).getKey()); assertValidId(r1.getCollectedfrom().get(0).getKey());
assertValidId(r2.getCollectedfrom().get(0).getKey()); assertValidId(r2.getCollectedfrom().get(0).getKey());
// System.out.println(new ObjectMapper().writeValueAsString(r1));
// System.out.println(new ObjectMapper().writeValueAsString(r2));
} }
private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException { private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile)); final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));
final ObjectMapper mapper = new ObjectMapper(); final ObjectMapper mapper = new ObjectMapper();
final List<TypedField> list = mapper.readValue(json, new TypeReference<List<TypedField>>() { final List<TypedField> list = mapper.readValue(json, new TypeReference<List<TypedField>>() {});
});
for (final TypedField tf : list) { for (final TypedField tf : list) {
if (tf.getValue() == null) { if (tf.getValue() == null) {
@ -270,7 +291,7 @@ class MigrateDbEntitiesApplicationTest {
final String[] values = ((List<?>) tf.getValue()) final String[] values = ((List<?>) tf.getValue())
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.map(Object::toString) .map(o -> o.toString())
.toArray(String[]::new); .toArray(String[]::new);
Mockito.when(arr.getArray()).thenReturn(values); Mockito.when(arr.getArray()).thenReturn(values);
@ -331,7 +352,6 @@ class MigrateDbEntitiesApplicationTest {
return new Float(getValueAs(name, fields).toString()); return new Float(getValueAs(name, fields).toString());
} }
@SuppressWarnings("unchecked")
private <T> T getValueAs(final String name, final List<TypedField> fields) { private <T> T getValueAs(final String name, final List<TypedField> fields) {
return fields return fields
.stream() .stream()

View File

@ -222,6 +222,11 @@
"type": "string", "type": "string",
"value": "pubsrepository::journal@@@dnet:datasource_typologies" "value": "pubsrepository::journal@@@dnet:datasource_typologies"
}, },
{
"field": "datasourcetypeui",
"type": "string",
"value": "pubsrepository::journal@@@dnet:datasource_typologies_ui"
},
{ {
"field": "provenanceaction", "field": "provenanceaction",
"type": "not_used", "type": "not_used",
@ -241,5 +246,27 @@
"field": "issnLinking", "field": "issnLinking",
"type": "string", "type": "string",
"value": "2579-5447" "value": "2579-5447"
},
{
"field": "jurisdiction",
"type": "string",
"value": "National@@@eosc:jurisdictions"
},
{
"field": "thematic",
"type": "boolean",
"value": true
},
{
"field": "knowledgegraph",
"type": "boolean",
"value": true
},
{
"field": "contentpolicies",
"type": "array",
"value": [
"Journal article@@@eosc:contentpolicies"
]
} }
] ]

View File

@ -27,14 +27,20 @@ import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
import scala.Tuple2; import scala.Tuple2;
/** /**
* CreateRelatedEntitiesJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type * CreateRelatedEntitiesJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type E_i map E_i as RelatedEntity
* E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join * T_i to simplify the model and extracting only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i)
* (R.target = T_i.id) save the tuples (R_i, T_i)
*/ */
public class CreateRelatedEntitiesJob_phase1 { public class CreateRelatedEntitiesJob_phase1 {
@ -42,71 +48,65 @@ public class CreateRelatedEntitiesJob_phase1 {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception { public static void main(final String[] args) throws Exception {
String jsonConfiguration = IOUtils final String jsonConfiguration = IOUtils
.toString( .toString(
Objects PrepareRelationsJob.class
.requireNonNull( .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json"));
CreateRelatedEntitiesJob_phase1.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args); parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged")) .ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf) .map(Boolean::valueOf)
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputRelationsPath = parser.get("inputRelationsPath"); final String inputRelationsPath = parser.get("inputRelationsPath");
log.info("inputRelationsPath: {}", inputRelationsPath); log.info("inputRelationsPath: {}", inputRelationsPath);
String inputEntityPath = parser.get("inputEntityPath"); final String inputEntityPath = parser.get("inputEntityPath");
log.info("inputEntityPath: {}", inputEntityPath); log.info("inputEntityPath: {}", inputEntityPath);
String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
String graphTableClassName = parser.get("graphTableClassName"); final String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName); log.info("graphTableClassName: {}", graphTableClassName);
@SuppressWarnings("unchecked") final Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(ProvisionModelSupport.getModelClasses()); conf.registerKryoClasses(ProvisionModelSupport.getModelClasses());
runWithSparkSession( runWithSparkSession(conf, isSparkSessionManaged, spark -> {
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath); removeOutputDir(spark, outputPath);
joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath); joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath);
}); });
} }
private static <E extends OafEntity> void joinRelationEntity( private static <E extends OafEntity> void joinRelationEntity(
SparkSession spark, final SparkSession spark,
String inputRelationsPath, final String inputRelationsPath,
String inputEntityPath, final String inputEntityPath,
Class<E> clazz, final Class<E> clazz,
String outputPath) { final String outputPath) {
Dataset<Tuple2<String, Relation>> relsByTarget = readPathRelation(spark, inputRelationsPath) final Dataset<Tuple2<String, Relation>> relsByTarget = readPathRelation(spark, inputRelationsPath)
.map( .map(
(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(r.getTarget(), (MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(r.getTarget(),
r), r),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))) Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class)))
.cache(); .cache();
Dataset<Tuple2<String, RelatedEntity>> entities = readPathEntity(spark, inputEntityPath, clazz) final Dataset<Tuple2<String, RelatedEntity>> entities = readPathEntity(spark, inputEntityPath, clazz)
.filter("dataInfo.invisible == false") .filter("dataInfo.invisible == false")
.map( .map(
(MapFunction<E, Tuple2<String, RelatedEntity>>) e -> new Tuple2<>(e.getId(), asRelatedEntity(e, clazz)), (MapFunction<E, Tuple2<String, RelatedEntity>>) e -> new Tuple2<>(e.getId(), asRelatedEntity(e, clazz)),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) Encoders
.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class)))
.cache(); .cache();
relsByTarget relsByTarget
@ -121,7 +121,9 @@ public class CreateRelatedEntitiesJob_phase1 {
} }
private static <E extends OafEntity> Dataset<E> readPathEntity( private static <E extends OafEntity> Dataset<E> readPathEntity(
SparkSession spark, String inputEntityPath, Class<E> entityClazz) { final SparkSession spark,
final String inputEntityPath,
final Class<E> entityClazz) {
log.info("Reading Graph table from: {}", inputEntityPath); log.info("Reading Graph table from: {}", inputEntityPath);
return spark return spark
@ -132,7 +134,7 @@ public class CreateRelatedEntitiesJob_phase1 {
Encoders.bean(entityClazz)); Encoders.bean(entityClazz));
} }
public static <E extends OafEntity> RelatedEntity asRelatedEntity(E entity, Class<E> clazz) { public static <E extends OafEntity> RelatedEntity asRelatedEntity(final E entity, final Class<E> clazz) {
final RelatedEntity re = new RelatedEntity(); final RelatedEntity re = new RelatedEntity();
re.setId(entity.getId()); re.setId(entity.getId());
@ -146,14 +148,10 @@ public class CreateRelatedEntitiesJob_phase1 {
case dataset: case dataset:
case otherresearchproduct: case otherresearchproduct:
case software: case software:
Result result = (Result) entity; final Result result = (Result) entity;
if (result.getTitle() != null && !result.getTitle().isEmpty()) { if (result.getTitle() != null && !result.getTitle().isEmpty()) {
final StructuredProperty title = result final StructuredProperty title = result.getTitle().stream().findFirst().get();
.getTitle()
.stream()
.findFirst()
.orElseThrow(() -> new IllegalStateException("missing title in " + entity.getId()));
title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH)); title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
re.setTitle(title); re.setTitle(title);
} }
@ -177,16 +175,17 @@ public class CreateRelatedEntitiesJob_phase1 {
break; break;
case datasource: case datasource:
Datasource d = (Datasource) entity; final Datasource d = (Datasource) entity;
re.setOfficialname(getValue(d.getOfficialname())); re.setOfficialname(getValue(d.getOfficialname()));
re.setWebsiteurl(getValue(d.getWebsiteurl())); re.setWebsiteurl(getValue(d.getWebsiteurl()));
re.setDatasourcetype(d.getDatasourcetype()); re.setDatasourcetype(d.getDatasourcetype());
re.setDatasourcetypeui(d.getDatasourcetypeui());
re.setOpenairecompatibility(d.getOpenairecompatibility()); re.setOpenairecompatibility(d.getOpenairecompatibility());
break; break;
case organization: case organization:
Organization o = (Organization) entity; final Organization o = (Organization) entity;
re.setLegalname(getValue(o.getLegalname())); re.setLegalname(getValue(o.getLegalname()));
re.setLegalshortname(getValue(o.getLegalshortname())); re.setLegalshortname(getValue(o.getLegalshortname()));
@ -194,50 +193,50 @@ public class CreateRelatedEntitiesJob_phase1 {
re.setWebsiteurl(getValue(o.getWebsiteurl())); re.setWebsiteurl(getValue(o.getWebsiteurl()));
break; break;
case project: case project:
Project p = (Project) entity; final Project p = (Project) entity;
re.setProjectTitle(getValue(p.getTitle())); re.setProjectTitle(getValue(p.getTitle()));
re.setCode(getValue(p.getCode())); re.setCode(getValue(p.getCode()));
re.setAcronym(getValue(p.getAcronym())); re.setAcronym(getValue(p.getAcronym()));
re.setContracttype(p.getContracttype()); re.setContracttype(p.getContracttype());
List<Field<String>> f = p.getFundingtree(); final List<Field<String>> f = p.getFundingtree();
if (!f.isEmpty()) { if (!f.isEmpty()) {
re.setFundingtree(f.stream().map(Field::getValue).collect(Collectors.toList())); re.setFundingtree(f.stream().map(s -> s.getValue()).collect(Collectors.toList()));
} }
break; break;
} }
return re; return re;
} }
private static String getValue(Field<String> field) { private static String getValue(final Field<String> field) {
return getFieldValueWithDefault(field, ""); return getFieldValueWithDefault(field, "");
} }
private static <T> T getFieldValueWithDefault(Field<T> f, T defaultValue) { private static <T> T getFieldValueWithDefault(final Field<T> f, final T defaultValue) {
return Optional return Optional
.ofNullable(f) .ofNullable(f)
.filter(Objects::nonNull) .filter(Objects::nonNull)
.map(Field::getValue) .map(x -> x.getValue())
.orElse(defaultValue); .orElse(defaultValue);
} }
/** /**
* Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text file,
* file
* *
* @param spark the SparkSession * @param spark
* @param relationPath the path storing the relation objects * @param relationPath
* @return the Dataset<SortableRelation> containing all the relationships * @return the Dataset<SortableRelation> containing all the relationships
*/ */
private static Dataset<Relation> readPathRelation( private static Dataset<Relation> readPathRelation(
SparkSession spark, final String relationPath) { final SparkSession spark,
final String relationPath) {
log.info("Reading relations from: {}", relationPath); log.info("Reading relations from: {}", relationPath);
return spark.read().load(relationPath).as(Encoders.bean(Relation.class)); return spark.read().load(relationPath).as(Encoders.bean(Relation.class));
} }
private static void removeOutputDir(SparkSession spark, String path) { private static void removeOutputDir(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
} }
} }

View File

@ -39,63 +39,53 @@ public class XmlConverterJob {
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class); private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
public static final String SCHEMA_LOCATION = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
public static void main(String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils IOUtils
.toString( .toString(
XmlConverterJob.class XmlConverterJob.class
.getResourceAsStream( .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
"/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
parser.parseArgument(args); parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged")) .ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf) .map(Boolean::valueOf)
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("inputPath"); final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath); log.info("inputPath: {}", inputPath);
String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
String isLookupUrl = parser.get("isLookupUrl"); final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl); log.info("isLookupUrl: {}", isLookupUrl);
String otherDsTypeId = parser.get("otherDsTypeId"); final SparkConf conf = new SparkConf();
log.info("otherDsTypeId: {}", otherDsTypeId);
SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(ProvisionModelSupport.getModelClasses()); conf.registerKryoClasses(ProvisionModelSupport.getModelClasses());
runWithSparkSession( runWithSparkSession(conf, isSparkSessionManaged, spark -> {
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath); removeOutputDir(spark, outputPath);
convertToXml( convertToXml(spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl));
spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl), otherDsTypeId);
}); });
} }
private static void convertToXml( private static void convertToXml(
SparkSession spark, final SparkSession spark,
String inputPath, final String inputPath,
String outputPath, final String outputPath,
ContextMapper contextMapper, final ContextMapper contextMapper) {
String otherDsTypeId) {
final XmlRecordFactory recordFactory = new XmlRecordFactory( final XmlRecordFactory recordFactory = new XmlRecordFactory(
prepareAccumulators(spark.sparkContext()), prepareAccumulators(spark.sparkContext()),
contextMapper, contextMapper,
false, false,
SCHEMA_LOCATION, schemaLocation);
otherDsTypeId);
final List<String> paths = HdfsSupport final List<String> paths = HdfsSupport
.listFiles(inputPath, spark.sparkContext().hadoopConfiguration()); .listFiles(inputPath, spark.sparkContext().hadoopConfiguration());
@ -115,16 +105,15 @@ public class XmlConverterJob {
.mapToPair( .mapToPair(
(PairFunction<Tuple2<String, String>, Text, Text>) t -> new Tuple2<>(new Text(t._1()), (PairFunction<Tuple2<String, String>, Text, Text>) t -> new Tuple2<>(new Text(t._1()),
new Text(t._2()))) new Text(t._2())))
.saveAsHadoopFile( .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
} }
private static void removeOutputDir(SparkSession spark, String path) { private static void removeOutputDir(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
} }
private static Map<String, LongAccumulator> prepareAccumulators(SparkContext sc) { private static Map<String, LongAccumulator> prepareAccumulators(final SparkContext sc) {
Map<String, LongAccumulator> accumulators = Maps.newHashMap(); final Map<String, LongAccumulator> accumulators = Maps.newHashMap();
accumulators accumulators
.put( .put(
"resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_isAmongTopNSimilarDocuments",
@ -135,15 +124,13 @@ public class XmlConverterJob {
sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments"));
accumulators accumulators
.put( .put(
"resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo"));
sc.longAccumulator("resultResult_supplement_isSupplementTo"));
accumulators accumulators
.put( .put(
"resultResult_supplement_isSupplementedBy", "resultResult_supplement_isSupplementedBy",
sc.longAccumulator("resultResult_supplement_isSupplementedBy")); sc.longAccumulator("resultResult_supplement_isSupplementedBy"));
accumulators accumulators
.put( .put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn"));
"resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn"));
accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges"));
accumulators accumulators
@ -151,16 +138,11 @@ public class XmlConverterJob {
"resultResult_publicationDataset_isRelatedTo", "resultResult_publicationDataset_isRelatedTo",
sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); sc.longAccumulator("resultResult_publicationDataset_isRelatedTo"));
accumulators accumulators
.put( .put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo"));
"resultResult_relationship_isRelatedTo",
sc.longAccumulator("resultResult_relationship_isRelatedTo"));
accumulators accumulators
.put( .put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy"));
"resultProject_outcome_isProducedBy",
sc.longAccumulator("resultProject_outcome_isProducedBy"));
accumulators accumulators
.put( .put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces"));
"resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces"));
accumulators accumulators
.put( .put(
"resultOrganization_affiliation_isAuthorInstitutionOf", "resultOrganization_affiliation_isAuthorInstitutionOf",
@ -183,9 +165,7 @@ public class XmlConverterJob {
"organizationOrganization_dedup_isMergedIn", "organizationOrganization_dedup_isMergedIn",
sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); sc.longAccumulator("organizationOrganization_dedup_isMergedIn"));
accumulators accumulators
.put( .put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces"));
"organizationOrganization_dedup_merges",
sc.longAccumulator("organizationOrganization_dedup_merges"));
accumulators accumulators
.put( .put(
"datasourceOrganization_provision_isProvidedBy", "datasourceOrganization_provision_isProvidedBy",

View File

@ -16,11 +16,5 @@
"paramLongName": "isLookupUrl", "paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service", "paramDescription": "URL of the isLookUp Service",
"paramRequired": true "paramRequired": true
},
{
"paramName": "odt",
"paramLongName": "otherDsTypeId",
"paramDescription": "list of datasource types to populate field datasourcetypeui",
"paramRequired": true
} }
] ]

View File

@ -25,10 +25,6 @@
<name>targetMaxRelations</name> <name>targetMaxRelations</name>
<description>maximum number of relations allowed for a each entity grouping by target</description> <description>maximum number of relations allowed for a each entity grouping by target</description>
</property> </property>
<property>
<name>otherDsTypeId</name>
<description>mapping used to populate datasourceTypeUi field</description>
</property>
<property> <property>
<name>format</name> <name>format</name>
<description>metadata format name (DMF|TMF)</description> <description>metadata format name (DMF|TMF)</description>
@ -582,7 +578,6 @@
<arg>--inputPath</arg><arg>${workingDir}/join_entities</arg> <arg>--inputPath</arg><arg>${workingDir}/join_entities</arg>
<arg>--outputPath</arg><arg>${workingDir}/xml</arg> <arg>--outputPath</arg><arg>${workingDir}/xml</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--otherDsTypeId</arg><arg>${otherDsTypeId}</arg>
</spark> </spark>
<ok to="should_index"/> <ok to="should_index"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -4,7 +4,6 @@ package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException; import java.io.IOException;
import java.util.Objects;
import javax.xml.transform.Transformer; import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerException;
@ -33,7 +32,7 @@ import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
* *
* The input is a JoinedEntity, i.e. a json representation of an OpenAIRE entity that embeds all the linked entities. * The input is a JoinedEntity, i.e. a json representation of an OpenAIRE entity that embeds all the linked entities.
*/ */
class IndexRecordTransformerTest { public class IndexRecordTransformerTest {
public static final String VERSION = "2021-04-15T10:05:53Z"; public static final String VERSION = "2021-04-15T10:05:53Z";
public static final String DSID = "b9ee796a-c49f-4473-a708-e7d67b84c16d_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl"; public static final String DSID = "b9ee796a-c49f-4473-a708-e7d67b84c16d_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl";
@ -46,23 +45,23 @@ class IndexRecordTransformerTest {
} }
@Test @Test
void testPreBuiltRecordTransformation() throws IOException, TransformerException { public void testPreBuiltRecordTransformation() throws IOException, TransformerException {
String record = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("record.xml"))); final String record = IOUtils.toString(getClass().getResourceAsStream("record.xml"));
testRecordTransformation(record); testRecordTransformation(record);
} }
@Test @Test
void testPublicationRecordTransformation() throws IOException, TransformerException { public void testPublicationRecordTransformation() throws IOException, TransformerException {
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.SCHEMA_LOCATION, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlRecordFactoryTest.otherDsTypeId); XmlConverterJob.schemaLocation);
Publication p = load("publication.json", Publication.class); final Publication p = load("publication.json", Publication.class);
Project pj = load("project.json", Project.class); final Project pj = load("project.json", Project.class);
Relation rel = load("relToValidatedProject.json", Relation.class); final Relation rel = load("relToValidatedProject.json", Relation.class);
JoinedEntity<Publication> je = new JoinedEntity<>(p); final JoinedEntity je = new JoinedEntity<>(p);
je je
.setLinks( .setLinks(
Lists Lists
@ -70,25 +69,25 @@ class IndexRecordTransformerTest {
new RelatedEntityWrapper(rel, new RelatedEntityWrapper(rel,
CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class)))); CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class))));
String record = xmlRecordFactory.build(je); final String record = xmlRecordFactory.build(je);
assertNotNull(record); assertNotNull(record);
testRecordTransformation(record); testRecordTransformation(record);
} }
private void testRecordTransformation(String record) throws IOException, TransformerException { private void testRecordTransformation(final String record) throws IOException, TransformerException {
String fields = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("fields.xml"))); final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
String xslt = IOUtils final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));
.toString(Objects.requireNonNull(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")));
String transformer = XmlIndexingJob.getLayoutTransformer("DMF", fields, xslt); final String transformer = XmlIndexingJob.getLayoutTransformer("DMF", fields, xslt);
Transformer tr = SaxonTransformerFactory.newInstance(transformer); final Transformer tr = SaxonTransformerFactory.newInstance(transformer);
String indexRecordXML = XmlIndexingJob.toIndexRecord(tr, record); final String indexRecordXML = XmlIndexingJob.toIndexRecord(tr, record);
SolrInputDocument solrDoc = new StreamingInputDocumentFactory(VERSION, DSID).parseDocument(indexRecordXML); final SolrInputDocument solrDoc = new StreamingInputDocumentFactory(VERSION, DSID)
.parseDocument(indexRecordXML);
final String xmlDoc = ClientUtils.toXML(solrDoc); final String xmlDoc = ClientUtils.toXML(solrDoc);
@ -96,9 +95,9 @@ class IndexRecordTransformerTest {
System.out.println(xmlDoc); System.out.println(xmlDoc);
} }
private <T> T load(String fileName, Class<T> clazz) throws IOException { private <T> T load(final String fileName, final Class<T> clazz) throws IOException {
return XmlRecordFactoryTest.OBJECT_MAPPER return XmlRecordFactoryTest.OBJECT_MAPPER
.readValue(IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream(fileName))), clazz); .readValue(IOUtils.toString(getClass().getResourceAsStream(fileName)), clazz);
} }
} }

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
@ -13,7 +14,6 @@ import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader; import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
@ -24,34 +24,31 @@ import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
class XmlRecordFactoryTest { public class XmlRecordFactoryTest {
public static final String otherDsTypeId = "scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource";
public static ObjectMapper OBJECT_MAPPER = new ObjectMapper() public static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test @Test
void testXMLRecordFactory() throws IOException, DocumentException { public void testXMLRecordFactory() throws IOException, DocumentException {
ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.SCHEMA_LOCATION, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
otherDsTypeId); XmlConverterJob.schemaLocation);
Publication p = OBJECT_MAPPER final Publication p = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
String xml = xmlRecordFactory.build(new JoinedEntity<>(p)); final String xml = xmlRecordFactory.build(new JoinedEntity<>(p));
assertNotNull(xml); assertNotNull(xml);
Document doc = new SAXReader().read(new StringReader(xml)); final Document doc = new SAXReader().read(new StringReader(xml));
assertNotNull(doc); assertNotNull(doc);
@ -72,93 +69,64 @@ class XmlRecordFactoryTest {
} }
@Test @Test
void testXMLRecordFactoryWithValidatedProject() throws IOException, DocumentException { public void testXMLRecordFactoryWithValidatedProject() throws IOException, DocumentException {
ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.SCHEMA_LOCATION, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
otherDsTypeId); XmlConverterJob.schemaLocation);
Publication p = OBJECT_MAPPER final Publication p = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
Project pj = OBJECT_MAPPER final Project pj = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("project.json")), Project.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("project.json")), Project.class);
Relation rel = OBJECT_MAPPER final Relation rel = OBJECT_MAPPER
.readValue( .readValue(IOUtils.toString(getClass().getResourceAsStream("relToValidatedProject.json")), Relation.class);
(IOUtils.toString(getClass().getResourceAsStream("relToValidatedProject.json"))), Relation.class); final RelatedEntity relatedProject = CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class);
RelatedEntity relatedProject = CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class); final List<RelatedEntityWrapper> links = Lists.newArrayList();
List<RelatedEntityWrapper> links = Lists.newArrayList(); final RelatedEntityWrapper rew = new RelatedEntityWrapper(rel, relatedProject);
RelatedEntityWrapper rew = new RelatedEntityWrapper(rel, relatedProject);
links.add(rew); links.add(rew);
JoinedEntity je = new JoinedEntity<>(p); final JoinedEntity je = new JoinedEntity<>(p);
je.setLinks(links); je.setLinks(links);
String xml = xmlRecordFactory.build(je); final String xml = xmlRecordFactory.build(je);
assertNotNull(xml); assertNotNull(xml);
Document doc = new SAXReader().read(new StringReader(xml)); final Document doc = new SAXReader().read(new StringReader(xml));
assertNotNull(doc); assertNotNull(doc);
System.out.println(doc.asXML()); System.out.println(doc.asXML());
Assertions.assertEquals("2021-01-01", doc.valueOf("//validated/@date")); Assertions.assertEquals("2021-01-01", doc.valueOf("//validated/@date"));
} }
@Test @Test
void testXMLRecordFactoryWithNonValidatedProject() throws IOException, DocumentException { public void testXMLRecordFactoryWithNonValidatedProject() throws IOException, DocumentException {
ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.SCHEMA_LOCATION, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
otherDsTypeId); XmlConverterJob.schemaLocation);
Publication p = OBJECT_MAPPER final Publication p = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
Project pj = OBJECT_MAPPER final Project pj = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("project.json")), Project.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("project.json")), Project.class);
Relation rel = OBJECT_MAPPER final Relation rel = OBJECT_MAPPER
.readValue((IOUtils.toString(getClass().getResourceAsStream("relToProject.json"))), Relation.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("relToProject.json")), Relation.class);
RelatedEntity relatedProject = CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class); final RelatedEntity relatedProject = CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class);
List<RelatedEntityWrapper> links = Lists.newArrayList(); final List<RelatedEntityWrapper> links = Lists.newArrayList();
RelatedEntityWrapper rew = new RelatedEntityWrapper(rel, relatedProject); final RelatedEntityWrapper rew = new RelatedEntityWrapper(rel, relatedProject);
links.add(rew); links.add(rew);
JoinedEntity je = new JoinedEntity<>(p); final JoinedEntity je = new JoinedEntity<>(p);
je.setLinks(links); je.setLinks(links);
String xml = xmlRecordFactory.build(je); final String xml = xmlRecordFactory.build(je);
assertNotNull(xml); assertNotNull(xml);
Document doc = new SAXReader().read(new StringReader(xml)); final Document doc = new SAXReader().read(new StringReader(xml));
assertNotNull(doc); assertNotNull(doc);
System.out.println(doc.asXML()); System.out.println(doc.asXML());
assertEquals("", doc.valueOf("//rel/validated")); assertEquals("", doc.valueOf("//rel/validated"));
} }
@Test
void testEnermapsRecord() throws IOException, DocumentException, SAXException {
String contextmap = "<entries><entry id=\"enermaps\" label=\"Energy Research\" name=\"context\" type=\"community\"/>"
+
"<entry id=\"enermaps::selection\" label=\"Featured dataset\" name=\"category\"/>" +
"<entry id=\"enermaps::selection::tgs00004\" label=\"Dataset title\" name=\"concept\"/>" +
"</entries>";
ContextMapper contextMapper = ContextMapper.fromXml(contextmap);
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.SCHEMA_LOCATION,
otherDsTypeId);
Dataset d = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("enermaps.json")), Dataset.class);
JoinedEntity je = new JoinedEntity<>(d);
String xml = xmlRecordFactory.build(je);
assertNotNull(xml);
Document doc = new SAXReader().read(new StringReader(xml));
assertNotNull(doc);
System.out.println(doc.asXML());
assertEquals("enermaps::selection::tgs00004", doc.valueOf("//concept/@id"));
}
} }

17
pom.xml
View File

@ -205,11 +205,6 @@
<artifactId>dateparser</artifactId> <artifactId>dateparser</artifactId>
<version>1.0.7</version> <version>1.0.7</version>
</dependency> </dependency>
<dependency>
<groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId>
<version>0.0.7</version>
</dependency>
<dependency> <dependency>
<groupId>com.google.guava</groupId> <groupId>com.google.guava</groupId>
@ -519,16 +514,6 @@
<version>${common.text.version}</version> <version>${common.text.version}</version>
</dependency> </dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>5.5</version>
</dependency>
<dependency>
<groupId>io.github.classgraph</groupId>
<artifactId>classgraph</artifactId>
<version>4.8.71</version>
</dependency>
</dependencies> </dependencies>
</dependencyManagement> </dependencyManagement>
@ -751,7 +736,7 @@
<mockito-core.version>3.3.3</mockito-core.version> <mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version> <mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version> <vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.7.15]</dhp-schemas.version> <dhp-schemas.version>[2.7.15-SNAPSHOT]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version> <dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version> <dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version> <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>