enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
25 changed files with 256 additions and 193 deletions
Showing only changes of commit b57e8ba374 - Show all commits

View File

@ -57,7 +57,6 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
@ -87,25 +86,32 @@ public class GenerateEventsApplication {
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMoreSoftware = new EnrichMoreSoftware();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy = new EnrichMissingPublicationIsReferencedBy();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy =
new EnrichMissingPublicationIsReferencedBy();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo = new EnrichMissingPublicationIsSupplementedTo();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy = new EnrichMissingPublicationIsSupplementedBy();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo =
new EnrichMissingPublicationIsSupplementedTo();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy =
new EnrichMissingPublicationIsSupplementedBy();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo = new EnrichMissingDatasetIsRelatedTo();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy = new EnrichMissingDatasetIsReferencedBy();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences = new EnrichMissingDatasetReferences();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo = new EnrichMissingDatasetIsSupplementedTo();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo =
new EnrichMissingDatasetIsRelatedTo();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy =
new EnrichMissingDatasetIsReferencedBy();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences =
new EnrichMissingDatasetReferences();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo =
new EnrichMissingDatasetIsSupplementedTo();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy =
new EnrichMissingDatasetIsSupplementedBy();
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenerateEventsApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
.toString(GenerateEventsApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
@ -128,10 +134,13 @@ public class GenerateEventsApplication {
final JavaRDD<Event> eventsRdd = sc.emptyRDD();
eventsRdd.union(generateSimpleEvents(spark, graphPath, Publication.class));
eventsRdd.union(generateSimpleEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class));
eventsRdd.union(generateSimpleEvents(spark, graphPath, Software.class));
eventsRdd.union(generateSimpleEvents(spark, graphPath, OtherResearchProduct.class));
for (final Class<? extends Result> r1 : BrokerConstants.RESULT_CLASSES) {
eventsRdd.union(generateSimpleEvents(spark, graphPath, r1));
for (final Class<? extends Result> r2 : BrokerConstants.RESULT_CLASSES) {
eventsRdd.union(generateRelationEvents(spark, graphPath, r1, r2));
}
}
eventsRdd.saveAsTextFile(eventsPath, GzipCodec.class);
});
@ -146,9 +155,8 @@ public class GenerateEventsApplication {
final String graphPath,
final Class<R> resultClazz) {
final Dataset<R> results = readPath(
spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz)
.filter(r -> r.getDataInfo().getDeletedbyinference());
final Dataset<R> results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz)
.filter(r -> r.getDataInfo().getDeletedbyinference());
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
@ -169,6 +177,14 @@ public class GenerateEventsApplication {
}
/**
 * Placeholder for generating events from the relations between results of two types.
 *
 * <p>Not implemented yet. The method fails fast instead of returning {@code null}: its result is
 * passed directly to {@code JavaRDD.union(...)} by the caller in this class, where a
 * {@code null} would only show up as an uninformative {@link NullPointerException}.
 *
 * @param spark       the active Spark session
 * @param graphPath   base path of the graph (presumably the same layout used by
 *                    {@code generateSimpleEvents} - to be confirmed when implemented)
 * @param sourceClass result type on the source side of the relation
 * @param targetClass result type on the target side of the relation
 * @return never returns normally in the current state
 * @throws UnsupportedOperationException always, until the method is implemented
 */
private static <SRC extends Result, TRG extends Result> JavaRDD<Event> generateRelationEvents(
	final SparkSession spark,
	final String graphPath,
	final Class<SRC> sourceClass,
	final Class<TRG> targetClass) {
	// TODO implement the generation of relation-based events
	throw new UnsupportedOperationException(
		"generateRelationEvents is not implemented yet (source="
			+ (sourceClass != null ? sourceClass.getSimpleName() : null)
			+ ", target="
			+ (targetClass != null ? targetClass.getSimpleName() : null)
			+ ")");
}
private List<Event> generateSimpleEvents(final Collection<Result> children) {
final List<UpdateInfo<?>> list = new ArrayList<>();

View File

@ -1,9 +1,21 @@
package eu.dnetlib.dhp.broker.oa.util;
import java.util.Arrays;
import java.util.List;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
public class BrokerConstants {
public final static String OPEN_ACCESS = "OPEN";
public final static String IS_MERGED_IN_CLASS = "isMergedIn";
public static final String OPEN_ACCESS = "OPEN";
public static final String IS_MERGED_IN_CLASS = "isMergedIn";
public static final List<Class<? extends Result>> RESULT_CLASSES =
Arrays.asList(Publication.class, Dataset.class, Software.class, OtherResearchProduct.class);
}

View File

@ -39,9 +39,6 @@ import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class GenerateEntitiesApplication {
@ -66,15 +63,15 @@ public class GenerateEntitiesApplication {
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String sourcePaths = parser.get("sourcePaths");
final String targetPath = parser.get("targetPath");
log.info("sourcePaths: {}", sourcePaths);
// final String dbUrl = parser.get("postgresUrl");
// final String dbUser = parser.get("postgresUser");
// final String dbPassword = parser.get("postgresPassword");
final String targetPath = parser.get("targetPath");
log.info("targetPath: {}", targetPath);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final VocabularyGroup vocs = loadVocsFromIS(isLookupUrl); // MAP: vocId -> voc
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupUrl);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
@ -165,35 +162,6 @@ public class GenerateEntitiesApplication {
}
}
/**
 * Builds a {@link VocabularyGroup} from the result of the bundled
 * {@code load_vocabularies.xquery}, executed through the IS lookup service.
 *
 * <p>Each string returned by {@code quickSearchProfile} is split on {@code "@=@"}; only strings
 * producing exactly four fields (vocabulary id, vocabulary name, term id, term name) are used,
 * every other string is ignored. All four fields are trimmed.
 *
 * @param isLookupUrl address passed to {@code ISLookupClientFactory.getLookUpService}
 * @return the group containing the vocabularies and terms found in the query result
 * @throws IOException       if the xquery resource is missing from the classpath or cannot be read
 * @throws ISLookUpException declared for the lookup service calls
 */
private static VocabularyGroup loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException {
	final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);

	final String xqueryPath = "/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery";
	final String xquery;
	// try-with-resources closes the classpath stream; the explicit charset keeps the decoded
	// query independent from the platform default encoding.
	try (final java.io.InputStream in = GenerateEntitiesApplication.class.getResourceAsStream(xqueryPath)) {
		if (in == null) {
			// getResourceAsStream signals a missing resource with null: report it explicitly
			throw new IOException("Classpath resource not found: " + xqueryPath);
		}
		xquery = IOUtils.toString(in, "UTF-8");
	}

	final VocabularyGroup vocs = new VocabularyGroup();

	for (final String s : isLookUpService.quickSearchProfile(xquery)) {
		final String[] arr = s.split("@=@");
		if (arr.length == 4) {
			final String vocId = arr[0].trim();
			final String vocName = arr[1].trim();
			final String termId = arr[2].trim();
			final String termName = arr[3].trim();

			// register the vocabulary the first time one of its terms is met
			if (!vocs.vocabularyExists(vocId)) {
				vocs.addVocabulary(vocId, vocName);
			}

			vocs.addTerm(vocId, termId, termName);
		}
	}

	return vocs;
}
private static Oaf convertFromJson(final String s, final Class<? extends Oaf> clazz) {
try {
return OBJECT_MAPPER.readValue(s, clazz);

View File

@ -10,7 +10,28 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
import java.io.Closeable;
import java.io.IOException;
@ -26,12 +47,13 @@ import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
@ -52,7 +74,7 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class);
private static final Logger log = LoggerFactory.getLogger(MigrateDbEntitiesApplication.class);
public static final String SOURCE_TYPE = "source_type";
public static final String TARGET_TYPE = "target_type";
@ -61,6 +83,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
private final long lastUpdateTimestamp;
private final VocabularyGroup vocs;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
@ -71,15 +95,25 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
parser.parseArgument(args);
final String dbUrl = parser.get("postgresUrl");
log.info("postgresUrl: {}", dbUrl);
final String dbUser = parser.get("postgresUser");
log.info("postgresUser: {}", dbUser);
final String dbPassword = parser.get("postgresPassword");
log.info("postgresPassword: xxx");
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String hdfsPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", hdfsPath);
final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims");
log.info("processClaims: {}", processClaims);
try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser,
dbPassword)) {
dbPassword, isLookupUrl)) {
if (processClaims) {
log.info("Processing claims...");
smdbe.execute("queryClaims.sql", smdbe::processClaims);
@ -103,18 +137,21 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
}
}
protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST
protected MigrateDbEntitiesApplication(final VocabularyGroup vocs) { // ONLY FOR UNIT TEST
super();
this.dbClient = null;
this.lastUpdateTimestamp = new Date().getTime();
this.vocs = vocs;
}
public MigrateDbEntitiesApplication(
final String hdfsPath, final String dbUrl, final String dbUser, final String dbPassword)
final String hdfsPath, final String dbUrl, final String dbUser, final String dbPassword,
final String isLookupUrl)
throws Exception {
super(hdfsPath);
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
this.lastUpdateTimestamp = new Date().getTime();
this.vocs = VocabularyGroup.loadVocsFromIS(isLookupUrl);
}
public void execute(final String sqlFile, final Function<ResultSet, List<Oaf>> producer)
@ -453,12 +490,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final Boolean inferred = rs.getBoolean("inferred");
final String trust = rs.getString("trust");
return dataInfo(
deletedbyinference,
inferenceprovenance,
inferred,
false,
ENTITYREGISTRY_PROVENANCE_ACTION,
trust);
deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
}
private Qualifier prepareQualifierSplitting(final String s) {
@ -466,7 +498,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
return null;
}
final String[] arr = s.split("@@@");
return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null;
return arr.length == 2 ? vocs.getTermAsQualifier(arr[1], arr[0]) : null;
}
private List<Field<String>> prepareListFields(final Array array, final DataInfo info) {
@ -485,8 +517,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
if (parts.length == 2) {
final String value = parts[0];
final String[] arr = parts[1].split("@@@");
if (arr.length == 4) {
return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo);
if (arr.length == 2) {
return structuredProperty(value, vocs.getTermAsQualifier(arr[1], arr[0]), dataInfo);
}
}
return null;

View File

@ -1,10 +1,11 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
public class Vocabulary {
public class Vocabulary implements Serializable {
private final String id;
private final String name;

View File

@ -1,12 +1,50 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
public class VocabularyGroup {
import eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class VocabularyGroup implements Serializable {
/**
 * Creates a {@link VocabularyGroup} populated from the IS lookup service.
 *
 * <p>The bundled {@code load_vocabularies.xquery} resource is submitted via
 * {@code quickSearchProfile}. Each returned string is split on {@code "@=@"} and must yield
 * exactly four fields (vocabulary id, vocabulary name, term id, term name) to be taken into
 * account; strings with a different number of fields are ignored. All fields are trimmed.
 *
 * @param isLookupUrl address passed to {@code ISLookupClientFactory.getLookUpService}
 * @return the group containing the vocabularies and terms found in the query result
 * @throws IOException       if the xquery resource is missing from the classpath or cannot be read
 * @throws ISLookUpException declared for the lookup service calls
 */
public static VocabularyGroup loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException {
	final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);

	final String xqueryPath = "/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery";
	final String xquery;
	// try-with-resources closes the classpath stream; the explicit charset keeps the decoded
	// query independent from the platform default encoding.
	try (final java.io.InputStream in = GenerateEntitiesApplication.class.getResourceAsStream(xqueryPath)) {
		if (in == null) {
			// getResourceAsStream signals a missing resource with null: report it explicitly
			throw new IOException("Classpath resource not found: " + xqueryPath);
		}
		xquery = IOUtils.toString(in, "UTF-8");
	}

	final VocabularyGroup vocs = new VocabularyGroup();

	for (final String s : isLookUpService.quickSearchProfile(xquery)) {
		final String[] arr = s.split("@=@");
		if (arr.length == 4) {
			final String vocId = arr[0].trim();
			final String vocName = arr[1].trim();
			final String termId = arr[2].trim();
			final String termName = arr[3].trim();

			// register the vocabulary the first time one of its terms is met
			if (!vocs.vocabularyExists(vocId)) {
				vocs.addVocabulary(vocId, vocName);
			}

			vocs.addTerm(vocId, termId, termName);
		}
	}

	return vocs;
}
private final Map<String, Vocabulary> vocs = new HashMap<>();
@ -29,7 +67,9 @@ public class VocabularyGroup {
}
public Qualifier getTermAsQualifier(final String vocId, final String id) {
if (termExists(vocId, id)) {
if (StringUtils.isBlank(id)) {
return OafMapperUtils.qualifier("UNKNOWN", "UNKNOWN", vocId, vocId);
} else if (termExists(vocId, id)) {
final Vocabulary v = vocs.get(vocId.toLowerCase());
final VocabularyTerm t = v.getTerm(id);
return OafMapperUtils.qualifier(t.getId(), t.getName(), v.getId(), v.getName());

View File

@ -1,7 +1,9 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
public class VocabularyTerm {
import java.io.Serializable;
public class VocabularyTerm implements Serializable {
private final String id;
private final String name;

View File

@ -18,8 +18,8 @@
"paramRequired": true
},
{
"paramName": "islookup",
"paramLongName": "islookup",
"paramName": "isu",
"paramLongName": "isLookupUrl",
"paramDescription": "the url of the ISLookupService",
"paramRequired": true
}

View File

@ -28,5 +28,11 @@
"paramLongName": "action",
"paramDescription": "process claims",
"paramRequired": false
},
{
"paramName": "isu",
"paramLongName": "isLookupUrl",
"paramDescription": "the url of the ISLookupService",
"paramRequired": true
}
]

View File

@ -123,6 +123,7 @@
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--action</arg><arg>claims</arg>
</java>
<ok to="ImportODF_claims"/>
@ -173,6 +174,7 @@
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</java>
<ok to="ImportODF"/>
<error to="Kill"/>
@ -237,7 +239,7 @@
</spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
<arg>--targetPath</arg><arg>${workingDir}/entities_claim</arg>
<arg>--islookup</arg><arg>${isLookupUrl}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="GenerateGraph_claims"/>
<error to="Kill"/>
@ -284,7 +286,7 @@
</spark-opts>
<arg>--sourcePaths</arg><arg>${contentPath}/db_records,${contentPath}/oaf_records,${contentPath}/odf_records</arg>
<arg>--targetPath</arg><arg>${workingDir}/entities</arg>
<arg>--islookup</arg><arg>${isLookupUrl}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="GenerateGraph"/>
<error to="Kill"/>

View File

@ -16,6 +16,10 @@
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>sparkDriverMemory</name>
@ -88,6 +92,7 @@
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</java>
<ok to="ImportDB_claims"/>
<error to="Kill"/>
@ -103,6 +108,7 @@
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--action</arg><arg>claims</arg>
</java>
<ok to="End"/>

View File

@ -24,6 +24,10 @@
<name>mongoDb</name>
<description>mongo database</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -62,6 +66,7 @@
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
<arg>-islookup</arg><arg>${isLookupUrl}</arg>
</java>
<ok to="ImportODF"/>
<error to="Kill"/>

View File

@ -1,17 +1,16 @@
SELECT
dor.datasource AS datasource,
dor.organization AS organization,
NULL AS startdate,
NULL AS enddate,
false AS inferred,
false AS deletedbyinference,
0.9 AS trust,
NULL AS inferenceprovenance,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics,
d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
dor.datasource AS datasource,
dor.organization AS organization,
NULL AS startdate,
NULL AS enddate,
false AS inferred,
false AS deletedbyinference,
0.9 AS trust,
NULL AS inferenceprovenance,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
'providedBy@@@dnet:datasources_organizations_typologies' AS semantics,
d.provenanceaction || '@@@dnet:provenanceActions' AS provenanceaction
FROM dsm_datasource_organization dor
LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id)
LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom)

View File

@ -7,36 +7,36 @@ SELECT
CASE
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire-cris_1.1'])
THEN
'openaire-cris_1.1@@@OpenAIRE CRIS v1.1@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'openaire-cris_1.1@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire4.0'])
THEN
'openaire4.0@@@OpenAIRE 4.0@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'openaire4.0@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0'])
THEN
'driver-openaire2.0@@@OpenAIRE 2.0+ (DRIVER OA, EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'driver-openaire2.0@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver'])
THEN
'driver@@@OpenAIRE Basic (DRIVER OA)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'driver@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0'])
THEN
'openaire2.0@@@OpenAIRE 2.0 (EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'openaire2.0@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0'])
THEN
'openaire3.0@@@OpenAIRE 3.0 (OA, funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'openaire3.0@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data'])
THEN
'openaire2.0_data@@@OpenAIRE Data (funded, referenced datasets)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'openaire2.0_data@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native'])
THEN
'native@@@proprietary@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'native@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['hostedBy'])
THEN
'hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'hostedBy@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible'])
THEN
'notCompatible@@@under validation@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'notCompatible@@@dnet:datasourceCompatibilityLevel'
ELSE
'UNKNOWN@@@not available@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
'UNKNOWN@@@dnet:datasourceCompatibilityLevel'
END AS openairecompatibility,
d.websiteurl AS websiteurl,
d.logourl AS logourl,
@ -47,7 +47,7 @@ SELECT
NULL AS odnumberofitems,
NULL AS odnumberofitemsdate,
(SELECT array_agg(s|| '###keywords@@@keywords@@@dnet:subject_classification_typologies@@@dnet:subject_classification_typologies')
(SELECT array_agg(s|| '###keywords@@@dnet:subject_classification_typologies')
FROM UNNEST(
ARRAY(
SELECT trim(s)
@ -83,32 +83,9 @@ SELECT
ARRAY[]::text[] AS policies,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
d.typology || '@@@' || CASE
WHEN (d.typology = 'crissystem') THEN 'CRIS System'
WHEN (d.typology = 'datarepository::unknown') THEN 'Data Repository'
WHEN (d.typology = 'aggregator::datarepository') THEN 'Data Repository Aggregator'
WHEN (d.typology = 'infospace') THEN 'Information Space'
WHEN (d.typology = 'pubsrepository::institutional') THEN 'Institutional Repository'
WHEN (d.typology = 'aggregator::pubsrepository::institutional') THEN 'Institutional Repository Aggregator'
WHEN (d.typology = 'pubsrepository::journal') THEN 'Journal'
WHEN (d.typology = 'aggregator::pubsrepository::journals') THEN 'Journal Aggregator/Publisher'
WHEN (d.typology = 'pubsrepository::mock') THEN 'Other'
WHEN (d.typology = 'pubscatalogue::unknown') THEN 'Publication Catalogue'
WHEN (d.typology = 'pubsrepository::unknown') THEN 'Publication Repository'
WHEN (d.typology = 'aggregator::pubsrepository::unknown') THEN 'Publication Repository Aggregator'
WHEN (d.typology = 'entityregistry') THEN 'Registry'
WHEN (d.typology = 'scholarcomminfra') THEN 'Scholarly Comm. Infrastructure'
WHEN (d.typology = 'pubsrepository::thematic') THEN 'Thematic Repository'
WHEN (d.typology = 'websource') THEN 'Web Source'
WHEN (d.typology = 'entityregistry::projects') THEN 'Funder database'
WHEN (d.typology = 'entityregistry::repositories') THEN 'Registry of repositories'
WHEN (d.typology = 'softwarerepository') THEN 'Software Repository'
WHEN (d.typology = 'aggregator::softwarerepository') THEN 'Software Repository Aggregator'
WHEN (d.typology = 'orprepository') THEN 'Repository'
ELSE 'Other'
END || '@@@dnet:datasource_typologies@@@dnet:datasource_typologies' AS datasourcetype,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
CONCAT(d.issn, ' @@@ ', d.eissn, ' @@@ ', d.lissn) AS journal
d.typology||'@@@dnet:datasource_typologies' AS datasourcetype,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
d.issn || ' @@@ ' || d.eissn || ' @@@ ' || d.lissn AS journal
FROM dsm_datasources d

View File

@ -22,13 +22,12 @@ SELECT
'' AS inferenceprovenance,
d.id AS collectedfromid,
d.officialname AS collectedfromname,
o.country || '@@@' || COALESCE(cntr.name,o.country) || '@@@dnet:countries@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
ARRAY[]::text[] AS pid
FROM dsm_organizations o
LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom)
LEFT OUTER JOIN class cntr ON (cntr.code = o.country)

View File

@ -11,8 +11,8 @@ SELECT
'' AS inferenceprovenance,
'openaire____::openorgs' AS collectedfromid,
'OpenOrgs Database' AS collectedfromname,
o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
FROM organizations o
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
@ -40,8 +40,8 @@ SELECT
'' AS inferenceprovenance,
'openaire____::openorgs' AS collectedfromid,
'OpenOrgs Database' AS collectedfromname,
o.country || '@@@' || o.country || '@@@dnet:countries@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
FROM other_names n
LEFT OUTER JOIN organizations o ON (n.id = o.id)

View File

@ -11,8 +11,8 @@ SELECT
NULL AS inferenceprovenance,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction
po.semanticclass || '@@@dnet:project_organization_relations' AS semantics,
'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction
FROM project_organization po
LEFT OUTER JOIN projects p ON (p.id = po.project)

View File

@ -31,17 +31,14 @@ SELECT
p.fundedamount AS fundedamount,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
p.contracttype || '@@@' || p.contracttypename || '@@@' || p.contracttypescheme || '@@@' || p.contracttypescheme AS contracttype,
pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
p.contracttype || '@@@' || p.contracttypescheme AS contracttype,
p.provenanceactionclass || '@@@' || p.provenanceactionscheme AS provenanceaction,
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
array_agg(DISTINCT fp.path) AS fundingtree
FROM projects p
LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
@ -53,9 +50,6 @@ SELECT
LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
GROUP BY
p.id,
p.code,
@ -85,5 +79,6 @@ SELECT
p.fundedamount,
dc.id,
dc.officialname,
pac.code, pac.name, pas.code, pas.name,
p.contracttype , p.contracttypename, p.contracttypescheme;
p.contracttype,
p.contracttypescheme;

View File

@ -28,18 +28,15 @@ SELECT
p.summary AS summary,
p.currency AS currency,
p.totalcost AS totalcost,
p.fundedamount AS fundedamount,
p.fundedamount AS fundedamount,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype,
pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
p.contracttypeclass || '@@@' || p.contracttypescheme AS contracttype,
p.provenanceactionclass || '@@@' || p.provenanceactionscheme AS provenanceaction,
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
array_agg(DISTINCT fp.path) AS fundingtree
FROM projects p
LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
@ -51,12 +48,6 @@ SELECT
LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass)
LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme)
GROUP BY
p.id,
p.code,
@ -85,6 +76,6 @@ SELECT
p.totalcost,
p.fundedamount,
dc.id,
dc.officialname,
pac.code, pac.name, pas.code, pas.name,
ctc.code, ctc.name, cts.code, cts.name;
dc.officialname

View File

@ -4,6 +4,8 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.sql.Array;
@ -25,6 +27,8 @@ import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
@ -40,9 +44,22 @@ public class MigrateDbEntitiesApplicationTest {
@Mock
private ResultSet rs;
@Mock
private VocabularyGroup vocs;
/**
 * Prepares the fixture before each test: stubs the mocked {@code vocs} vocabulary group and
 * builds the {@code MigrateDbEntitiesApplication} instance stored in {@code app}.
 */
@BeforeEach
public void setUp() {
// NOTE(review): app is assigned twice in this method as shown here. This looks like a diff
// view in which the no-arg construction is the removed line and the vocs-based construction
// at the end is its replacement - confirm against the actual file.
this.app = new MigrateDbEntitiesApplication();
// Any (String, String) qualifier lookup answers with a qualifier built from the call's own
// arguments: argument 1 is passed as the first two qualifier parameters and argument 0 as the
// last two. Presumably these are (classid, classname, schemeid, schemename) - verify against
// OafMapperUtils.qualifier.
// Stubbed through lenient(), presumably so tests that never consult the vocabulary are not
// failed for unused stubs - TODO confirm the strictness configured for this test class.
lenient()
.when(vocs.getTermAsQualifier(anyString(), anyString()))
.thenAnswer(
invocation -> OafMapperUtils
.qualifier(
invocation.getArgument(1), invocation.getArgument(1), invocation.getArgument(0),
invocation.getArgument(0)));
// Every (String, String) term-existence check reports true.
lenient().when(vocs.termExists(anyString(), anyString())).thenReturn(true);
this.app = new MigrateDbEntitiesApplication(vocs);
}
@Test
@ -61,8 +78,7 @@ public class MigrateDbEntitiesApplicationTest {
assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields));
assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields));
assertEquals(
ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
assertEquals(ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
}
@Test
@ -78,8 +94,7 @@ public class MigrateDbEntitiesApplicationTest {
assertValidId(p.getCollectedfrom().get(0).getKey());
assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields));
assertEquals(p.getTitle().getValue(), getValueAsString("title", fields));
assertEquals(
p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
assertEquals(p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
}
@Test
@ -99,13 +114,10 @@ public class MigrateDbEntitiesApplicationTest {
assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields));
assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]);
assertEquals(
o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]);
assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]);
assertEquals(
o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[3]);
assertEquals(
o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
assertEquals(o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[0]);
assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[1]);
assertEquals(o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[1]);
assertEquals(o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields));
}
@Test

View File

@ -52,7 +52,7 @@
{
"field": "semantics",
"type": "not_used",
"value": "providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies"
"value": "providedBy@@@dnet:datasources_organizations_typologies"
},
{
"field": "provenanceaction",

View File

@ -30,7 +30,7 @@
{
"field": "openairecompatibility",
"type": "string",
"value": "hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel"
"value": "hostedBy@@@dnet:datasourceCompatibilityLevel"
},
{
"field": "websiteurl",
@ -219,16 +219,16 @@
{
"field": "datasourcetype",
"type": "string",
"value": "pubsrepository::journal@@@Journal@@@dnet:datasource_typologies@@@dnet:datasource_typologies"
"value": "pubsrepository::journal@@@dnet:datasource_typologies"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions"
},
{
"field": "journal",
"type": "string",
"value": "2579-5449@@@2597-6540@@@"
"value": "2579-5449 @@@ 2597-6540 @@@ "
}
]

View File

@ -117,11 +117,11 @@
{
"field": "country",
"type": "string",
"value": "US@@@US@@@dnet:countries@@@dnet:countries"
"value": "US@@@dnet:countries"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions"
}
]

View File

@ -62,11 +62,11 @@
{
"field": "semantics",
"type": "not_used",
"value": "coordinator@@@coordinator@@@dnet:project_organization_relations@@@dnet:project_organization_relations"
"value": "coordinator@@@dnet:project_organization_relations"
},
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions"
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenance_actions"
}
]

View File

@ -167,7 +167,7 @@
{
"field": "provenanceaction",
"type": "not_used",
"value": "sysimport:crosswalk:entityregistry@@@Harvested@@@dnet:provenanceActions@@@dnet:provenanceActions"
"value": "sysimport:crosswalk:entityregistry@@@dnet:provenanceActions"
},
{
"field": "pid",