1
0
Fork 0

Merge pull request 'towards EOSC datasource profiles' (#130) from datasource_model_eosc_beta into beta

Reviewed-on: D-Net/dnet-hadoop#130
This commit is contained in:
Alessia Bardi 2021-08-23 11:58:34 +02:00
commit 3bcac7e88c
16 changed files with 770 additions and 728 deletions

View File

@ -1,17 +1,7 @@
package eu.dnetlib.dhp.actionmanager.project;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.GetCSV;
import eu.dnetlib.dhp.common.collection.HttpConnector2;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.BufferedReader;
import java.io.File;
@ -19,127 +9,140 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme;
import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.GetCSV;
import eu.dnetlib.dhp.common.collection.HttpConnector2;
public class DownloadCsvTest {
private static String workingDir;
private static String workingDir;
private static LocalFileSystem fs;
private static LocalFileSystem fs;
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(DownloadCsvTest.class.getSimpleName())
.toString();
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(DownloadCsvTest.class.getSimpleName())
.toString();
fs = FileSystem.getLocal(new Configuration());
}
fs = FileSystem.getLocal(new Configuration());
}
@Disabled
@Test
void getProgrammeFileTest() throws Exception {
@Disabled
@Test
void getProgrammeFileTest() throws Exception {
String fileURL = "https://cordis.europa.eu/data/reference/cordisref-h2020programmes.csv";
String fileURL = "https://cordis.europa.eu/data/reference/cordisref-h2020programmes.csv";
GetCSV
.getCsv(
fs, new BufferedReader(
new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL))),
workingDir + "/programme",
CSVProgramme.class.getName(), ';');
GetCSV
.getCsv(
fs, new BufferedReader(
new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL))),
workingDir + "/programme",
CSVProgramme.class.getName(), ';');
BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(workingDir + "/programme"))));
BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(workingDir + "/programme"))));
String line;
int count = 0;
while ((line = in.readLine()) != null) {
CSVProgramme csvp = new ObjectMapper().readValue(line, CSVProgramme.class);
if (count == 0) {
assertTrue(csvp.getCode().equals("H2020-EU.5.f."));
assertTrue(
csvp
.getTitle()
.startsWith(
"Develop the governance for the advancement of responsible research and innovation by all stakeholders"));
assertTrue(csvp.getTitle().endsWith("promote an ethics framework for research and innovation"));
assertTrue(csvp.getShortTitle().equals(""));
assertTrue(csvp.getLanguage().equals("en"));
}
if (count == 28) {
assertTrue(csvp.getCode().equals("H2020-EU.3.5.4."));
assertTrue(
csvp
.getTitle()
.equals(
"Grundlagen für den Übergang zu einer umweltfreundlichen Wirtschaft und Gesellschaft durch Öko-Innovation"));
assertTrue(csvp.getShortTitle().equals("A green economy and society through eco-innovation"));
assertTrue(csvp.getLanguage().equals("de"));
}
if (count == 229) {
assertTrue(csvp.getCode().equals("H2020-EU.3.2."));
assertTrue(
csvp
.getTitle()
.equals(
"SOCIETAL CHALLENGES - Food security, sustainable agriculture and forestry, marine, maritime and inland water research, and the bioeconomy"));
assertTrue(
csvp.getShortTitle().equals("Food, agriculture, forestry, marine research and bioeconomy"));
assertTrue(csvp.getLanguage().equals("en"));
}
assertTrue(csvp.getCode() != null);
assertTrue(csvp.getCode().startsWith("H2020"));
count += 1;
}
String line;
int count = 0;
while ((line = in.readLine()) != null) {
CSVProgramme csvp = new ObjectMapper().readValue(line, CSVProgramme.class);
if (count == 0) {
assertTrue(csvp.getCode().equals("H2020-EU.5.f."));
assertTrue(
csvp
.getTitle()
.startsWith(
"Develop the governance for the advancement of responsible research and innovation by all stakeholders"));
assertTrue(csvp.getTitle().endsWith("promote an ethics framework for research and innovation"));
assertTrue(csvp.getShortTitle().equals(""));
assertTrue(csvp.getLanguage().equals("en"));
}
if (count == 28) {
assertTrue(csvp.getCode().equals("H2020-EU.3.5.4."));
assertTrue(
csvp
.getTitle()
.equals(
"Grundlagen für den Übergang zu einer umweltfreundlichen Wirtschaft und Gesellschaft durch Öko-Innovation"));
assertTrue(csvp.getShortTitle().equals("A green economy and society through eco-innovation"));
assertTrue(csvp.getLanguage().equals("de"));
}
if (count == 229) {
assertTrue(csvp.getCode().equals("H2020-EU.3.2."));
assertTrue(
csvp
.getTitle()
.equals(
"SOCIETAL CHALLENGES - Food security, sustainable agriculture and forestry, marine, maritime and inland water research, and the bioeconomy"));
assertTrue(
csvp.getShortTitle().equals("Food, agriculture, forestry, marine research and bioeconomy"));
assertTrue(csvp.getLanguage().equals("en"));
}
assertTrue(csvp.getCode() != null);
assertTrue(csvp.getCode().startsWith("H2020"));
count += 1;
}
Assertions.assertEquals(767, count);
}
Assertions.assertEquals(767, count);
}
@Disabled
@Test
void getProjectFileTest() throws IOException, CollectorException, ClassNotFoundException {
String fileURL = "https://cordis.europa.eu/data/cordis-h2020projects.csv";
@Disabled
@Test
void getProjectFileTest() throws IOException, CollectorException, ClassNotFoundException {
String fileURL = "https://cordis.europa.eu/data/cordis-h2020projects.csv";
GetCSV
.getCsv(
fs,
new BufferedReader(new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL)))
, workingDir + "/projects",
CSVProject.class.getName(), ';');
GetCSV
.getCsv(
fs,
new BufferedReader(new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL))),
workingDir + "/projects",
CSVProject.class.getName(), ';');
BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(workingDir + "/projects"))));
BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(workingDir + "/projects"))));
String line;
int count = 0;
while ((line = in.readLine()) != null) {
CSVProject csvp = new ObjectMapper().readValue(line, CSVProject.class);
if (count == 0) {
assertTrue(csvp.getId().equals("771736"));
assertTrue(csvp.getProgramme().equals("H2020-EU.1.1."));
assertTrue(csvp.getTopics().equals("ERC-2017-COG"));
String line;
int count = 0;
while ((line = in.readLine()) != null) {
CSVProject csvp = new ObjectMapper().readValue(line, CSVProject.class);
if (count == 0) {
assertTrue(csvp.getId().equals("771736"));
assertTrue(csvp.getProgramme().equals("H2020-EU.1.1."));
assertTrue(csvp.getTopics().equals("ERC-2017-COG"));
}
if (count == 22882) {
assertTrue(csvp.getId().equals("752903"));
assertTrue(csvp.getProgramme().equals("H2020-EU.1.3.2."));
assertTrue(csvp.getTopics().equals("MSCA-IF-2016"));
}
if (count == 223023) {
assertTrue(csvp.getId().equals("861952"));
assertTrue(csvp.getProgramme().equals("H2020-EU.4.e."));
assertTrue(csvp.getTopics().equals("SGA-SEWP-COST-2019"));
}
assertTrue(csvp.getId() != null);
assertTrue(csvp.getProgramme().startsWith("H2020"));
count += 1;
}
}
if (count == 22882) {
assertTrue(csvp.getId().equals("752903"));
assertTrue(csvp.getProgramme().equals("H2020-EU.1.3.2."));
assertTrue(csvp.getTopics().equals("MSCA-IF-2016"));
}
if (count == 223023) {
assertTrue(csvp.getId().equals("861952"));
assertTrue(csvp.getProgramme().equals("H2020-EU.4.e."));
assertTrue(csvp.getTopics().equals("SGA-SEWP-COST-2019"));
}
assertTrue(csvp.getId() != null);
assertTrue(csvp.getProgramme().startsWith("H2020"));
count += 1;
}
Assertions.assertEquals(34957, count);
}
Assertions.assertEquals(34957, count);
}
@AfterAll
public static void cleanup() {
FileUtils.deleteQuietly(new File(workingDir));
}
@AfterAll
public static void cleanup() {
FileUtils.deleteQuietly(new File(workingDir));
}
}

View File

@ -33,7 +33,7 @@ public class Process implements Serializable {
ri.setType(Constants.RESEARCH_INFRASTRUCTURE);
}
ri.setId(Utils.getContextId(ci.getId()));
ri.setOriginalId(ci.getId());
ri.setAcronym(ci.getId());
ri.setDescription(ci.getDescription());
ri.setName(ci.getName());

View File

@ -1,15 +1,51 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_MERGED_IN;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.MERGES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORG_ORG_RELTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.asString;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.io.Closeable;
import java.io.IOException;
import java.sql.Array;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Predicate;
@ -50,8 +86,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
private static final Logger log = LoggerFactory.getLogger(MigrateDbEntitiesApplication.class);
private static final DataInfo DATA_INFO_CLAIM = dataInfo(
false, null, false, false,
qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9");
false, null, false, false, qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS),
"0.9");
private static final List<KeyValue> COLLECTED_FROM_CLAIM = listKeyValues(
createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE");
@ -69,10 +105,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
MigrateDbEntitiesApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json"))));
MigrateDbEntitiesApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json")));
parser.parseArgument(args);
@ -86,7 +120,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
log.info("postgresPassword: xxx");
final String dbSchema = parser.get("dbschema");
log.info("dbSchema {}: ", dbSchema);
log.info("dbSchema {}: " + dbSchema);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
@ -139,8 +173,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
case openorgs_dedup: // generates organization entities and relations for openorgs dedup
log.info("Processing Openorgs...");
smdbe
.execute(
"queryOpenOrgsForOrgsDedup.sql", smdbe::processOrganization, verifyNamespacePrefix);
.execute("queryOpenOrgsForOrgsDedup.sql", smdbe::processOrganization, verifyNamespacePrefix);
log.info("Processing Openorgs Sim Rels...");
smdbe.execute("queryOpenOrgsSimilarityForOrgsDedup.sql", smdbe::processOrgOrgSimRels);
@ -149,8 +182,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
case openorgs: // generates organization entities and relations for provision
log.info("Processing Openorgs For Provision...");
smdbe
.execute(
"queryOpenOrgsForProvision.sql", smdbe::processOrganization, verifyNamespacePrefix);
.execute("queryOpenOrgsForProvision.sql", smdbe::processOrganization, verifyNamespacePrefix);
log.info("Processing Openorgs Merge Rels...");
smdbe.execute("queryOpenOrgsSimilarityForProvision.sql", smdbe::processOrgOrgMergeRels);
@ -228,6 +260,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB
ds.setOaiprovenance(null); // Values not present in the DB
ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype")));
ds.setDatasourcetypeui(prepareQualifierSplitting(rs.getString("datasourcetypeui")));
ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility")));
ds.setOfficialname(field(rs.getString("officialname"), info));
ds.setEnglishname(field(rs.getString("englishname"), info));
@ -269,6 +302,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
ds.setDataInfo(info);
ds.setLastupdatetimestamp(lastUpdateTimestamp);
ds.setJurisdiction(prepareQualifierSplitting(rs.getString("jurisdiction")));
ds.setThematic(rs.getBoolean("thematic"));
ds.setKnowledgegraph(rs.getBoolean("knowledgegraph"));
ds.setContentpolicies(prepareListOfQualifiers(rs.getArray("contentpolicies")));
return Arrays.asList(ds);
} catch (final Exception e) {
throw new RuntimeException(e);
@ -494,8 +532,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
throw new IllegalStateException(
String
.format(
"invalid claim, sourceId: %s, targetId: %s, semantics: %s",
sourceId, targetId, semantics));
"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
semantics));
}
r1 = setRelationSemantic(r1, RESULT_PROJECT, OUTCOME, PRODUCES);
r2 = setRelationSemantic(r2, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY);
@ -515,8 +553,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
}
}
private Relation prepareRelation(String sourceId, String targetId, String validationDate) {
Relation r = new Relation();
private Relation prepareRelation(final String sourceId, final String targetId, final String validationDate) {
final Relation r = new Relation();
if (StringUtils.isNotBlank(validationDate)) {
r.setValidated(true);
r.setValidationDate(validationDate);
@ -529,7 +567,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
return r;
}
private Relation setRelationSemantic(Relation r, String relType, String subRelType, String relClass) {
private Relation setRelationSemantic(final Relation r, final String relType, final String subRelType,
final String relClass) {
r.setRelType(relType);
r.setSubRelType(subRelType);
r.setRelClass(relClass);
@ -602,6 +641,19 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
return res;
}
private List<Qualifier> prepareListOfQualifiers(final Array array) throws SQLException {
final List<Qualifier> res = new ArrayList<>();
if (array != null) {
for (final String s : (String[]) array.getArray()) {
final Qualifier q = prepareQualifierSplitting(s);
if (q != null) {
res.add(q);
}
}
}
return res;
}
public List<Oaf> processOrgOrgMergeRels(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs); // TODO
@ -658,6 +710,18 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
// removed because there's no difference between two sides //TODO
// final Relation r2 = new Relation();
// r2.setRelType(ORG_ORG_RELTYPE);
// r2.setSubRelType(ORG_ORG_SUBRELTYPE);
// r2.setRelClass(relClass);
// r2.setSource(orgId2);
// r2.setTarget(orgId1);
// r2.setCollectedfrom(collectedFrom);
// r2.setDataInfo(info);
// r2.setLastupdatetimestamp(lastUpdateTimestamp);
// return Arrays.asList(r1, r2);
return Arrays.asList(r1);
} catch (final Exception e) {
throw new RuntimeException(e);

View File

@ -165,7 +165,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
return Arrays.asList(instance);
}
protected String trimAndDecodeUrl(String url){
protected String trimAndDecodeUrl(String url) {
try {
return URLDecoder.decode(url.trim(), "UTF-8");
} catch (UnsupportedEncodingException e) {

View File

@ -84,13 +84,18 @@ SELECT
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
d.typology||'@@@dnet:datasource_typologies' AS datasourcetype,
d.typology||'@@@dnet:datasource_typologies_ui' AS datasourcetypeui,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
d.issn AS issnPrinted,
d.eissn AS issnOnline,
d.lissn AS issnLinking
d.lissn AS issnLinking,
de.jurisdiction||'@@@eosc:jurisdictions' AS jurisdiction,
de.thematic AS thematic,
de.knowledge_graph AS knowledgegraph,
array(select unnest(de.content_policies)||'@@@eosc:contentpolicies') AS contentpolicies
FROM dsm_datasources d
LEFT OUTER JOIN dsm_datasources_eosc de on (d.id = de.id)
LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id)
LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource)
LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource)
@ -126,4 +131,8 @@ GROUP BY
dc.officialname,
d.issn,
d.eissn,
d.lissn
d.lissn,
de.jurisdiction,
de.thematic,
de.knowledge_graph,
de.content_policies

View File

@ -11,7 +11,6 @@ import java.util.List;
import java.util.Objects;
import java.util.Optional;
import com.fasterxml.jackson.core.JsonProcessingException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.DocumentException;
@ -21,6 +20,7 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
@ -760,7 +760,6 @@ class MappersTest {
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
}
@Test
void testXMLEncodedURL() throws IOException, DocumentException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url.xml")));
@ -779,7 +778,8 @@ class MappersTest {
@Test
void testXMLEncodedURL_ODF() throws IOException, DocumentException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url_odf.xml")));
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url_odf.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
@ -788,7 +788,7 @@ class MappersTest {
final Dataset p = (Dataset) list.get(0);
assertTrue(p.getInstance().size() > 0);
for(String url : p.getInstance().get(0).getUrl()){
for (String url : p.getInstance().get(0).getUrl()) {
System.out.println(url);
assertTrue(!url.contains("&amp;"));
}

View File

@ -28,11 +28,16 @@ import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ExtendWith(MockitoExtension.class)
class MigrateDbEntitiesApplicationTest {
public class MigrateDbEntitiesApplicationTest {
private MigrateDbEntitiesApplication app;
@ -46,11 +51,8 @@ class MigrateDbEntitiesApplicationTest {
public void setUp() {
lenient()
.when(vocs.getTermAsQualifier(anyString(), anyString()))
.thenAnswer(
invocation -> OafMapperUtils
.qualifier(
invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0),
invocation.getArgument(0)));
.thenAnswer(invocation -> OafMapperUtils
.qualifier(invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0), invocation.getArgument(0)));
lenient().when(vocs.termExists(anyString(), anyString())).thenReturn(true);
@ -58,7 +60,7 @@ class MigrateDbEntitiesApplicationTest {
}
@Test
void testProcessDatasource() throws Exception {
public void testProcessDatasource() throws Exception {
final List<TypedField> fields = prepareMocks("datasources_resultset_entry.json");
final List<Oaf> list = app.processDatasource(rs);
@ -78,10 +80,27 @@ class MigrateDbEntitiesApplicationTest {
assertEquals(getValueAsString("issnPrinted", fields), ds.getJournal().getIssnPrinted());
assertEquals(getValueAsString("issnOnline", fields), ds.getJournal().getIssnOnline());
assertEquals(getValueAsString("issnLinking", fields), ds.getJournal().getIssnLinking());
assertEquals("pubsrepository::journal", ds.getDatasourcetype().getClassid());
assertEquals("dnet:datasource_typologies", ds.getDatasourcetype().getSchemeid());
assertEquals("pubsrepository::journal", ds.getDatasourcetypeui().getClassid());
assertEquals("dnet:datasource_typologies_ui", ds.getDatasourcetypeui().getSchemeid());
assertEquals("National", ds.getJurisdiction().getClassid());
assertEquals("eosc:jurisdictions", ds.getJurisdiction().getSchemeid());
assertTrue(ds.getThematic());
assertTrue(ds.getKnowledgegraph());
assertEquals(1, ds.getContentpolicies().size());
assertEquals("Journal article", ds.getContentpolicies().get(0).getClassid());
assertEquals("eosc:contentpolicies", ds.getContentpolicies().get(0).getSchemeid());
}
@Test
void testProcessProject() throws Exception {
public void testProcessProject() throws Exception {
final List<TypedField> fields = prepareMocks("projects_resultset_entry.json");
final List<Oaf> list = app.processProject(rs);
@ -99,7 +118,7 @@ class MigrateDbEntitiesApplicationTest {
}
@Test
void testProcessOrganization() throws Exception {
public void testProcessOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("organizations_resultset_entry.json");
final List<Oaf> list = app.processOrganization(rs);
@ -119,14 +138,14 @@ class MigrateDbEntitiesApplicationTest {
assertEquals(getValueAsString("country", fields).split("@@@")[1], o.getCountry().getSchemeid());
assertEquals(getValueAsString("country", fields).split("@@@")[1], o.getCountry().getSchemename());
assertEquals(getValueAsString("collectedfromname", fields), o.getCollectedfrom().get(0).getValue());
List<String> alternativenames = getValueAsList("alternativenames", fields);
final List<String> alternativenames = getValueAsList("alternativenames", fields);
assertEquals(2, alternativenames.size());
assertTrue(alternativenames.contains("Pippo"));
assertTrue(alternativenames.contains("Foo"));
}
@Test
void testProcessDatasourceOrganization() throws Exception {
public void testProcessDatasourceOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json");
final List<Oaf> list = app.processDatasourceOrganization(rs);
@ -143,7 +162,7 @@ class MigrateDbEntitiesApplicationTest {
}
@Test
void testProcessProjectOrganization() throws Exception {
public void testProcessProjectOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("projectorganization_resultset_entry.json");
final List<Oaf> list = app.processProjectOrganization(rs);
@ -162,7 +181,7 @@ class MigrateDbEntitiesApplicationTest {
}
@Test
void testProcessClaims_context() throws Exception {
public void testProcessClaims_context() throws Exception {
final List<TypedField> fields = prepareMocks("claimscontext_resultset_entry.json");
final List<Oaf> list = app.processClaims(rs);
@ -177,7 +196,7 @@ class MigrateDbEntitiesApplicationTest {
}
@Test
void testProcessClaims_rels() throws Exception {
public void testProcessClaims_rels() throws Exception {
final List<TypedField> fields = prepareMocks("claimsrel_resultset_entry.json");
final List<Oaf> list = app.processClaims(rs);
@ -208,78 +227,80 @@ class MigrateDbEntitiesApplicationTest {
assertValidId(r1.getCollectedfrom().get(0).getKey());
assertValidId(r2.getCollectedfrom().get(0).getKey());
// System.out.println(new ObjectMapper().writeValueAsString(r1));
// System.out.println(new ObjectMapper().writeValueAsString(r2));
}
private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));
final ObjectMapper mapper = new ObjectMapper();
final List<TypedField> list = mapper.readValue(json, new TypeReference<List<TypedField>>() {
});
final List<TypedField> list = mapper.readValue(json, new TypeReference<List<TypedField>>() {});
for (final TypedField tf : list) {
if (tf.getValue() == null) {
switch (tf.getType()) {
case "not_used":
break;
case "boolean":
Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false);
break;
case "date":
Mockito.when(rs.getDate(tf.getField())).thenReturn(null);
break;
case "int":
Mockito.when(rs.getInt(tf.getField())).thenReturn(0);
break;
case "double":
Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0);
break;
case "array":
Mockito.when(rs.getArray(tf.getField())).thenReturn(null);
break;
case "string":
default:
Mockito.when(rs.getString(tf.getField())).thenReturn(null);
break;
case "not_used":
break;
case "boolean":
Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false);
break;
case "date":
Mockito.when(rs.getDate(tf.getField())).thenReturn(null);
break;
case "int":
Mockito.when(rs.getInt(tf.getField())).thenReturn(0);
break;
case "double":
Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0);
break;
case "array":
Mockito.when(rs.getArray(tf.getField())).thenReturn(null);
break;
case "string":
default:
Mockito.when(rs.getString(tf.getField())).thenReturn(null);
break;
}
} else {
switch (tf.getType()) {
case "not_used":
break;
case "boolean":
Mockito
.when(rs.getBoolean(tf.getField()))
.thenReturn(Boolean.parseBoolean(tf.getValue().toString()));
break;
case "date":
Mockito
.when(rs.getDate(tf.getField()))
.thenReturn(Date.valueOf(tf.getValue().toString()));
break;
case "int":
Mockito
.when(rs.getInt(tf.getField()))
.thenReturn(new Integer(tf.getValue().toString()));
break;
case "double":
Mockito
.when(rs.getDouble(tf.getField()))
.thenReturn(new Double(tf.getValue().toString()));
break;
case "array":
final Array arr = Mockito.mock(Array.class);
final String[] values = ((List<?>) tf.getValue())
.stream()
.filter(Objects::nonNull)
.map(Object::toString)
.toArray(String[]::new);
case "not_used":
break;
case "boolean":
Mockito
.when(rs.getBoolean(tf.getField()))
.thenReturn(Boolean.parseBoolean(tf.getValue().toString()));
break;
case "date":
Mockito
.when(rs.getDate(tf.getField()))
.thenReturn(Date.valueOf(tf.getValue().toString()));
break;
case "int":
Mockito
.when(rs.getInt(tf.getField()))
.thenReturn(new Integer(tf.getValue().toString()));
break;
case "double":
Mockito
.when(rs.getDouble(tf.getField()))
.thenReturn(new Double(tf.getValue().toString()));
break;
case "array":
final Array arr = Mockito.mock(Array.class);
final String[] values = ((List<?>) tf.getValue())
.stream()
.filter(Objects::nonNull)
.map(o -> o.toString())
.toArray(String[]::new);
Mockito.when(arr.getArray()).thenReturn(values);
Mockito.when(rs.getArray(tf.getField())).thenReturn(arr);
break;
case "string":
default:
Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString());
break;
Mockito.when(arr.getArray()).thenReturn(values);
Mockito.when(rs.getArray(tf.getField())).thenReturn(arr);
break;
case "string":
default:
Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString());
break;
}
}
}
@ -291,27 +312,27 @@ class MigrateDbEntitiesApplicationTest {
for (final TypedField tf : list) {
switch (tf.getType()) {
case "not_used":
break;
case "boolean":
Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField());
break;
case "date":
Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField());
break;
case "int":
Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField());
break;
case "double":
Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField());
break;
case "array":
Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField());
break;
case "string":
default:
Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField());
break;
case "not_used":
break;
case "boolean":
Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField());
break;
case "date":
Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField());
break;
case "int":
Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField());
break;
case "double":
Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField());
break;
case "array":
Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField());
break;
case "string":
default:
Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField());
break;
}
}
}
@ -331,7 +352,6 @@ class MigrateDbEntitiesApplicationTest {
return new Float(getValueAs(name, fields).toString());
}
@SuppressWarnings("unchecked")
private <T> T getValueAs(final String name, final List<TypedField> fields) {
return fields
.stream()

View File

@ -222,6 +222,11 @@
"type": "string",
"value": "pubsrepository::journal@@@dnet:datasource_typologies"
},
{
"field": "datasourcetypeui",
"type": "string",
"value": "pubsrepository::journal@@@dnet:datasource_typologies_ui"
},
{
"field": "provenanceaction",
"type": "not_used",
@ -241,5 +246,27 @@
"field": "issnLinking",
"type": "string",
"value": "2579-5447"
},
{
"field": "jurisdiction",
"type": "string",
"value": "National@@@eosc:jurisdictions"
},
{
"field": "thematic",
"type": "boolean",
"value": true
},
{
"field": "knowledgegraph",
"type": "boolean",
"value": true
},
{
"field": "contentpolicies",
"type": "array",
"value": [
"Journal article@@@eosc:contentpolicies"
]
}
]

View File

@ -27,14 +27,20 @@ import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
import scala.Tuple2;
/**
* CreateRelatedEntitiesJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type
* E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join
* (R.target = T_i.id) save the tuples (R_i, T_i)
* CreateRelatedEntitiesJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type E_i map E_i as RelatedEntity
* T_i to simplify the model and extracting only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i)
*/
public class CreateRelatedEntitiesJob_phase1 {
@ -42,71 +48,65 @@ public class CreateRelatedEntitiesJob_phase1 {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
public static void main(final String[] args) throws Exception {
String jsonConfiguration = IOUtils
final String jsonConfiguration = IOUtils
.toString(
Objects
.requireNonNull(
CreateRelatedEntitiesJob_phase1.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")));
PrepareRelationsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputRelationsPath = parser.get("inputRelationsPath");
final String inputRelationsPath = parser.get("inputRelationsPath");
log.info("inputRelationsPath: {}", inputRelationsPath);
String inputEntityPath = parser.get("inputEntityPath");
final String inputEntityPath = parser.get("inputEntityPath");
log.info("inputEntityPath: {}", inputEntityPath);
String outputPath = parser.get("outputPath");
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String graphTableClassName = parser.get("graphTableClassName");
final String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName);
@SuppressWarnings("unchecked")
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
final Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
SparkConf conf = new SparkConf();
final SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(ProvisionModelSupport.getModelClasses());
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath);
});
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, outputPath);
joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath);
});
}
private static <E extends OafEntity> void joinRelationEntity(
SparkSession spark,
String inputRelationsPath,
String inputEntityPath,
Class<E> clazz,
String outputPath) {
final SparkSession spark,
final String inputRelationsPath,
final String inputEntityPath,
final Class<E> clazz,
final String outputPath) {
Dataset<Tuple2<String, Relation>> relsByTarget = readPathRelation(spark, inputRelationsPath)
final Dataset<Tuple2<String, Relation>> relsByTarget = readPathRelation(spark, inputRelationsPath)
.map(
(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(r.getTarget(),
r),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class)))
.cache();
Dataset<Tuple2<String, RelatedEntity>> entities = readPathEntity(spark, inputEntityPath, clazz)
final Dataset<Tuple2<String, RelatedEntity>> entities = readPathEntity(spark, inputEntityPath, clazz)
.filter("dataInfo.invisible == false")
.map(
(MapFunction<E, Tuple2<String, RelatedEntity>>) e -> new Tuple2<>(e.getId(), asRelatedEntity(e, clazz)),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class)))
Encoders
.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class)))
.cache();
relsByTarget
@ -121,7 +121,9 @@ public class CreateRelatedEntitiesJob_phase1 {
}
private static <E extends OafEntity> Dataset<E> readPathEntity(
SparkSession spark, String inputEntityPath, Class<E> entityClazz) {
final SparkSession spark,
final String inputEntityPath,
final Class<E> entityClazz) {
log.info("Reading Graph table from: {}", inputEntityPath);
return spark
@ -132,7 +134,7 @@ public class CreateRelatedEntitiesJob_phase1 {
Encoders.bean(entityClazz));
}
public static <E extends OafEntity> RelatedEntity asRelatedEntity(E entity, Class<E> clazz) {
public static <E extends OafEntity> RelatedEntity asRelatedEntity(final E entity, final Class<E> clazz) {
final RelatedEntity re = new RelatedEntity();
re.setId(entity.getId());
@ -146,14 +148,10 @@ public class CreateRelatedEntitiesJob_phase1 {
case dataset:
case otherresearchproduct:
case software:
Result result = (Result) entity;
final Result result = (Result) entity;
if (result.getTitle() != null && !result.getTitle().isEmpty()) {
final StructuredProperty title = result
.getTitle()
.stream()
.findFirst()
.orElseThrow(() -> new IllegalStateException("missing title in " + entity.getId()));
final StructuredProperty title = result.getTitle().stream().findFirst().get();
title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
re.setTitle(title);
}
@ -177,16 +175,17 @@ public class CreateRelatedEntitiesJob_phase1 {
break;
case datasource:
Datasource d = (Datasource) entity;
final Datasource d = (Datasource) entity;
re.setOfficialname(getValue(d.getOfficialname()));
re.setWebsiteurl(getValue(d.getWebsiteurl()));
re.setDatasourcetype(d.getDatasourcetype());
re.setDatasourcetypeui(d.getDatasourcetypeui());
re.setOpenairecompatibility(d.getOpenairecompatibility());
break;
case organization:
Organization o = (Organization) entity;
final Organization o = (Organization) entity;
re.setLegalname(getValue(o.getLegalname()));
re.setLegalshortname(getValue(o.getLegalshortname()));
@ -194,50 +193,50 @@ public class CreateRelatedEntitiesJob_phase1 {
re.setWebsiteurl(getValue(o.getWebsiteurl()));
break;
case project:
Project p = (Project) entity;
final Project p = (Project) entity;
re.setProjectTitle(getValue(p.getTitle()));
re.setCode(getValue(p.getCode()));
re.setAcronym(getValue(p.getAcronym()));
re.setContracttype(p.getContracttype());
List<Field<String>> f = p.getFundingtree();
final List<Field<String>> f = p.getFundingtree();
if (!f.isEmpty()) {
re.setFundingtree(f.stream().map(Field::getValue).collect(Collectors.toList()));
re.setFundingtree(f.stream().map(s -> s.getValue()).collect(Collectors.toList()));
}
break;
}
return re;
}
private static String getValue(Field<String> field) {
private static String getValue(final Field<String> field) {
return getFieldValueWithDefault(field, "");
}
private static <T> T getFieldValueWithDefault(Field<T> f, T defaultValue) {
private static <T> T getFieldValueWithDefault(final Field<T> f, final T defaultValue) {
return Optional
.ofNullable(f)
.filter(Objects::nonNull)
.map(Field::getValue)
.map(x -> x.getValue())
.orElse(defaultValue);
}
/**
* Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
* file
* Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text file,
*
* @param spark the SparkSession
* @param relationPath the path storing the relation objects
* @param spark
* @param relationPath
* @return the Dataset<SortableRelation> containing all the relationships
*/
private static Dataset<Relation> readPathRelation(
SparkSession spark, final String relationPath) {
final SparkSession spark,
final String relationPath) {
log.info("Reading relations from: {}", relationPath);
return spark.read().load(relationPath).as(Encoders.bean(Relation.class));
}
private static void removeOutputDir(SparkSession spark, String path) {
private static void removeOutputDir(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}

View File

@ -39,63 +39,53 @@ public class XmlConverterJob {
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
public static final String SCHEMA_LOCATION = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
public static void main(String[] args) throws Exception {
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
XmlConverterJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("inputPath");
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String outputPath = parser.get("outputPath");
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String isLookupUrl = parser.get("isLookupUrl");
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
String otherDsTypeId = parser.get("otherDsTypeId");
log.info("otherDsTypeId: {}", otherDsTypeId);
SparkConf conf = new SparkConf();
final SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(ProvisionModelSupport.getModelClasses());
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
convertToXml(
spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl), otherDsTypeId);
});
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, outputPath);
convertToXml(spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl));
});
}
private static void convertToXml(
SparkSession spark,
String inputPath,
String outputPath,
ContextMapper contextMapper,
String otherDsTypeId) {
final SparkSession spark,
final String inputPath,
final String outputPath,
final ContextMapper contextMapper) {
final XmlRecordFactory recordFactory = new XmlRecordFactory(
prepareAccumulators(spark.sparkContext()),
contextMapper,
false,
SCHEMA_LOCATION,
otherDsTypeId);
schemaLocation);
final List<String> paths = HdfsSupport
.listFiles(inputPath, spark.sparkContext().hadoopConfiguration());
@ -115,16 +105,15 @@ public class XmlConverterJob {
.mapToPair(
(PairFunction<Tuple2<String, String>, Text, Text>) t -> new Tuple2<>(new Text(t._1()),
new Text(t._2())))
.saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}
private static void removeOutputDir(SparkSession spark, String path) {
private static void removeOutputDir(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
private static Map<String, LongAccumulator> prepareAccumulators(SparkContext sc) {
Map<String, LongAccumulator> accumulators = Maps.newHashMap();
private static Map<String, LongAccumulator> prepareAccumulators(final SparkContext sc) {
final Map<String, LongAccumulator> accumulators = Maps.newHashMap();
accumulators
.put(
"resultResult_similarity_isAmongTopNSimilarDocuments",
@ -135,15 +124,13 @@ public class XmlConverterJob {
sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments"));
accumulators
.put(
"resultResult_supplement_isSupplementTo",
sc.longAccumulator("resultResult_supplement_isSupplementTo"));
"resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo"));
accumulators
.put(
"resultResult_supplement_isSupplementedBy",
sc.longAccumulator("resultResult_supplement_isSupplementedBy"));
accumulators
.put(
"resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn"));
.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn"));
accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges"));
accumulators
@ -151,16 +138,11 @@ public class XmlConverterJob {
"resultResult_publicationDataset_isRelatedTo",
sc.longAccumulator("resultResult_publicationDataset_isRelatedTo"));
accumulators
.put(
"resultResult_relationship_isRelatedTo",
sc.longAccumulator("resultResult_relationship_isRelatedTo"));
.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo"));
accumulators
.put(
"resultProject_outcome_isProducedBy",
sc.longAccumulator("resultProject_outcome_isProducedBy"));
.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy"));
accumulators
.put(
"resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces"));
.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces"));
accumulators
.put(
"resultOrganization_affiliation_isAuthorInstitutionOf",
@ -183,9 +165,7 @@ public class XmlConverterJob {
"organizationOrganization_dedup_isMergedIn",
sc.longAccumulator("organizationOrganization_dedup_isMergedIn"));
accumulators
.put(
"organizationOrganization_dedup_merges",
sc.longAccumulator("organizationOrganization_dedup_merges"));
.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces"));
accumulators
.put(
"datasourceOrganization_provision_isProvidedBy",

View File

@ -16,11 +16,5 @@
"paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "odt",
"paramLongName": "otherDsTypeId",
"paramDescription": "list of datasource types to populate field datasourcetypeui",
"paramRequired": true
}
]

View File

@ -25,10 +25,6 @@
<name>targetMaxRelations</name>
<description>maximum number of relations allowed for a each entity grouping by target</description>
</property>
<property>
<name>otherDsTypeId</name>
<description>mapping used to populate datasourceTypeUi field</description>
</property>
<property>
<name>format</name>
<description>metadata format name (DMF|TMF)</description>
@ -582,7 +578,6 @@
<arg>--inputPath</arg><arg>${workingDir}/join_entities</arg>
<arg>--outputPath</arg><arg>${workingDir}/xml</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--otherDsTypeId</arg><arg>${otherDsTypeId}</arg>
</spark>
<ok to="should_index"/>
<error to="Kill"/>

View File

@ -4,7 +4,6 @@ package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException;
import java.util.Objects;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
@ -33,7 +32,7 @@ import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
*
* The input is a JoinedEntity, i.e. a json representation of an OpenAIRE entity that embeds all the linked entities.
*/
class IndexRecordTransformerTest {
public class IndexRecordTransformerTest {
public static final String VERSION = "2021-04-15T10:05:53Z";
public static final String DSID = "b9ee796a-c49f-4473-a708-e7d67b84c16d_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl";
@ -46,23 +45,23 @@ class IndexRecordTransformerTest {
}
@Test
void testPreBuiltRecordTransformation() throws IOException, TransformerException {
String record = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("record.xml")));
public void testPreBuiltRecordTransformation() throws IOException, TransformerException {
final String record = IOUtils.toString(getClass().getResourceAsStream("record.xml"));
testRecordTransformation(record);
}
@Test
void testPublicationRecordTransformation() throws IOException, TransformerException {
public void testPublicationRecordTransformation() throws IOException, TransformerException {
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.SCHEMA_LOCATION,
XmlRecordFactoryTest.otherDsTypeId);
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation);
Publication p = load("publication.json", Publication.class);
Project pj = load("project.json", Project.class);
Relation rel = load("relToValidatedProject.json", Relation.class);
final Publication p = load("publication.json", Publication.class);
final Project pj = load("project.json", Project.class);
final Relation rel = load("relToValidatedProject.json", Relation.class);
JoinedEntity<Publication> je = new JoinedEntity<>(p);
final JoinedEntity je = new JoinedEntity<>(p);
je
.setLinks(
Lists
@ -70,25 +69,25 @@ class IndexRecordTransformerTest {
new RelatedEntityWrapper(rel,
CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class))));
String record = xmlRecordFactory.build(je);
final String record = xmlRecordFactory.build(je);
assertNotNull(record);
testRecordTransformation(record);
}
private void testRecordTransformation(String record) throws IOException, TransformerException {
String fields = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("fields.xml")));
String xslt = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("layoutToRecordTransformer.xsl")));
private void testRecordTransformation(final String record) throws IOException, TransformerException {
final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));
String transformer = XmlIndexingJob.getLayoutTransformer("DMF", fields, xslt);
final String transformer = XmlIndexingJob.getLayoutTransformer("DMF", fields, xslt);
Transformer tr = SaxonTransformerFactory.newInstance(transformer);
final Transformer tr = SaxonTransformerFactory.newInstance(transformer);
String indexRecordXML = XmlIndexingJob.toIndexRecord(tr, record);
final String indexRecordXML = XmlIndexingJob.toIndexRecord(tr, record);
SolrInputDocument solrDoc = new StreamingInputDocumentFactory(VERSION, DSID).parseDocument(indexRecordXML);
final SolrInputDocument solrDoc = new StreamingInputDocumentFactory(VERSION, DSID)
.parseDocument(indexRecordXML);
final String xmlDoc = ClientUtils.toXML(solrDoc);
@ -96,9 +95,9 @@ class IndexRecordTransformerTest {
System.out.println(xmlDoc);
}
private <T> T load(String fileName, Class<T> clazz) throws IOException {
private <T> T load(final String fileName, final Class<T> clazz) throws IOException {
return XmlRecordFactoryTest.OBJECT_MAPPER
.readValue(IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream(fileName))), clazz);
.readValue(IOUtils.toString(getClass().getResourceAsStream(fileName)), clazz);
}
}

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException;
import java.io.StringReader;
@ -13,7 +14,6 @@ import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
@ -24,34 +24,31 @@ import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
class XmlRecordFactoryTest {
public static final String otherDsTypeId = "scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource";
public class XmlRecordFactoryTest {
public static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
void testXMLRecordFactory() throws IOException, DocumentException {
public void testXMLRecordFactory() throws IOException, DocumentException {
ContextMapper contextMapper = new ContextMapper();
final ContextMapper contextMapper = new ContextMapper();
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.SCHEMA_LOCATION,
otherDsTypeId);
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation);
Publication p = OBJECT_MAPPER
final Publication p = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
String xml = xmlRecordFactory.build(new JoinedEntity<>(p));
final String xml = xmlRecordFactory.build(new JoinedEntity<>(p));
assertNotNull(xml);
Document doc = new SAXReader().read(new StringReader(xml));
final Document doc = new SAXReader().read(new StringReader(xml));
assertNotNull(doc);
@ -72,93 +69,64 @@ class XmlRecordFactoryTest {
}
@Test
void testXMLRecordFactoryWithValidatedProject() throws IOException, DocumentException {
public void testXMLRecordFactoryWithValidatedProject() throws IOException, DocumentException {
ContextMapper contextMapper = new ContextMapper();
final ContextMapper contextMapper = new ContextMapper();
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.SCHEMA_LOCATION,
otherDsTypeId);
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation);
Publication p = OBJECT_MAPPER
final Publication p = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
Project pj = OBJECT_MAPPER
final Project pj = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("project.json")), Project.class);
Relation rel = OBJECT_MAPPER
.readValue(
(IOUtils.toString(getClass().getResourceAsStream("relToValidatedProject.json"))), Relation.class);
RelatedEntity relatedProject = CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class);
List<RelatedEntityWrapper> links = Lists.newArrayList();
RelatedEntityWrapper rew = new RelatedEntityWrapper(rel, relatedProject);
final Relation rel = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("relToValidatedProject.json")), Relation.class);
final RelatedEntity relatedProject = CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class);
final List<RelatedEntityWrapper> links = Lists.newArrayList();
final RelatedEntityWrapper rew = new RelatedEntityWrapper(rel, relatedProject);
links.add(rew);
JoinedEntity je = new JoinedEntity<>(p);
final JoinedEntity je = new JoinedEntity<>(p);
je.setLinks(links);
String xml = xmlRecordFactory.build(je);
final String xml = xmlRecordFactory.build(je);
assertNotNull(xml);
Document doc = new SAXReader().read(new StringReader(xml));
final Document doc = new SAXReader().read(new StringReader(xml));
assertNotNull(doc);
System.out.println(doc.asXML());
Assertions.assertEquals("2021-01-01", doc.valueOf("//validated/@date"));
}
@Test
void testXMLRecordFactoryWithNonValidatedProject() throws IOException, DocumentException {
public void testXMLRecordFactoryWithNonValidatedProject() throws IOException, DocumentException {
ContextMapper contextMapper = new ContextMapper();
final ContextMapper contextMapper = new ContextMapper();
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.SCHEMA_LOCATION,
otherDsTypeId);
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation);
Publication p = OBJECT_MAPPER
final Publication p = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
Project pj = OBJECT_MAPPER
final Project pj = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("project.json")), Project.class);
Relation rel = OBJECT_MAPPER
.readValue((IOUtils.toString(getClass().getResourceAsStream("relToProject.json"))), Relation.class);
RelatedEntity relatedProject = CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class);
List<RelatedEntityWrapper> links = Lists.newArrayList();
RelatedEntityWrapper rew = new RelatedEntityWrapper(rel, relatedProject);
final Relation rel = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("relToProject.json")), Relation.class);
final RelatedEntity relatedProject = CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class);
final List<RelatedEntityWrapper> links = Lists.newArrayList();
final RelatedEntityWrapper rew = new RelatedEntityWrapper(rel, relatedProject);
links.add(rew);
JoinedEntity je = new JoinedEntity<>(p);
final JoinedEntity je = new JoinedEntity<>(p);
je.setLinks(links);
String xml = xmlRecordFactory.build(je);
final String xml = xmlRecordFactory.build(je);
assertNotNull(xml);
Document doc = new SAXReader().read(new StringReader(xml));
final Document doc = new SAXReader().read(new StringReader(xml));
assertNotNull(doc);
System.out.println(doc.asXML());
assertEquals("", doc.valueOf("//rel/validated"));
}
@Test
void testEnermapsRecord() throws IOException, DocumentException, SAXException {
String contextmap = "<entries><entry id=\"enermaps\" label=\"Energy Research\" name=\"context\" type=\"community\"/>"
+
"<entry id=\"enermaps::selection\" label=\"Featured dataset\" name=\"category\"/>" +
"<entry id=\"enermaps::selection::tgs00004\" label=\"Dataset title\" name=\"concept\"/>" +
"</entries>";
ContextMapper contextMapper = ContextMapper.fromXml(contextmap);
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.SCHEMA_LOCATION,
otherDsTypeId);
Dataset d = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("enermaps.json")), Dataset.class);
JoinedEntity je = new JoinedEntity<>(d);
String xml = xmlRecordFactory.build(je);
assertNotNull(xml);
Document doc = new SAXReader().read(new StringReader(xml));
assertNotNull(doc);
System.out.println(doc.asXML());
assertEquals("enermaps::selection::tgs00004", doc.valueOf("//concept/@id"));
}
}

17
pom.xml
View File

@ -205,11 +205,6 @@
<artifactId>dateparser</artifactId>
<version>1.0.7</version>
</dependency>
<dependency>
<groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId>
<version>0.0.7</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
@ -519,16 +514,6 @@
<version>${common.text.version}</version>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>5.5</version>
</dependency>
<dependency>
<groupId>io.github.classgraph</groupId>
<artifactId>classgraph</artifactId>
<version>4.8.71</version>
</dependency>
</dependencies>
</dependencyManagement>
@ -751,7 +736,7 @@
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.7.15]</dhp-schemas.version>
<dhp-schemas.version>[2.7.15-SNAPSHOT]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>