diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index accc06d12..e32dd10fa 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -13,6 +13,7 @@ public class ModelConstants { public static final String DNET_DATA_CITE_DATE = "dnet:dataCite_date"; public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource"; public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; + public static final String DNET_COUNTRY_TYPE = "dnet:countries"; public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository"; public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry"; @@ -49,6 +50,13 @@ public class ModelConstants { public static final String HAS_PARTICIPANT = "hasParticipant"; public static final String IS_PARTICIPANT = "isParticipant"; + public static final String RESULT_ORGANIZATION = "resultOrganization"; + public static final String AFFILIATION = "affiliation"; + public static final String IS_AUTHOR_INSTITUTION_OF = "isAuthorInstitutionOf"; + public static final String HAS_AUTHOR_INSTITUTION = "hasAuthorInstitution"; + + public static final String MERGES = "merges"; + public static final String UNKNOWN = "UNKNOWN"; public static final String NOT_AVAILABLE = "not available"; diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java index 7b0b9a1e2..9ee7c2deb 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java @@ -1,9 +1,15 @@ + package eu.dnetlib.dhp.schema.common; +import static com.google.common.base.Preconditions.checkArgument; + import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.function.Function; +import org.apache.commons.lang3.StringUtils; + import com.google.common.collect.Maps; import eu.dnetlib.dhp.schema.oaf.*; @@ -65,216 +71,216 @@ public class ModelSupport { static { relationInverseMap - .put( - "personResult_authorship_isAuthorOf", new RelationInverse() - .setRelation("isAuthorOf") - .setInverse("hasAuthor") - .setRelType("personResult") - .setSubReltype("authorship")); + .put( + "personResult_authorship_isAuthorOf", new RelationInverse() + .setRelation("isAuthorOf") + .setInverse("hasAuthor") + .setRelType("personResult") + .setSubReltype("authorship")); relationInverseMap - .put( - "personResult_authorship_hasAuthor", new RelationInverse() - .setInverse("isAuthorOf") - .setRelation("hasAuthor") - .setRelType("personResult") - .setSubReltype("authorship")); + .put( + "personResult_authorship_hasAuthor", new RelationInverse() + .setInverse("isAuthorOf") + .setRelation("hasAuthor") + .setRelType("personResult") + .setSubReltype("authorship")); relationInverseMap - .put( - "projectOrganization_participation_isParticipant", new RelationInverse() - .setRelation("isParticipant") - .setInverse("hasParticipant") - .setRelType("projectOrganization") - .setSubReltype("participation")); + .put( + "projectOrganization_participation_isParticipant", new RelationInverse() + .setRelation("isParticipant") + .setInverse("hasParticipant") + .setRelType("projectOrganization") + .setSubReltype("participation")); relationInverseMap - .put( - "projectOrganization_participation_hasParticipant", new RelationInverse() - .setInverse("isParticipant") - .setRelation("hasParticipant") - .setRelType("projectOrganization") - .setSubReltype("participation")); + .put( + "projectOrganization_participation_hasParticipant", new RelationInverse() + .setInverse("isParticipant") + .setRelation("hasParticipant") + .setRelType("projectOrganization") + .setSubReltype("participation")); relationInverseMap - .put( - "resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse() - .setRelation("hasAuthorInstitution") - .setInverse("isAuthorInstitutionOf") - .setRelType("resultOrganization") - .setSubReltype("affiliation")); + .put( + "resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse() + .setRelation("hasAuthorInstitution") + .setInverse("isAuthorInstitutionOf") + .setRelType("resultOrganization") + .setSubReltype("affiliation")); relationInverseMap - .put( - "resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse() - .setInverse("hasAuthorInstitution") - .setRelation("isAuthorInstitutionOf") - .setRelType("resultOrganization") - .setSubReltype("affiliation")); + .put( + "resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse() + .setInverse("hasAuthorInstitution") + .setRelation("isAuthorInstitutionOf") + .setRelType("resultOrganization") + .setSubReltype("affiliation")); relationInverseMap - .put( - "organizationOrganization_dedup_merges", new RelationInverse() - .setRelation("merges") - .setInverse("isMergedIn") - .setRelType("organizationOrganization") - .setSubReltype("dedup")); + .put( + "organizationOrganization_dedup_merges", new RelationInverse() + .setRelation("merges") + .setInverse("isMergedIn") + .setRelType("organizationOrganization") + .setSubReltype("dedup")); relationInverseMap - .put( - "organizationOrganization_dedup_isMergedIn", new RelationInverse() - .setInverse("merges") - .setRelation("isMergedIn") - .setRelType("organizationOrganization") - .setSubReltype("dedup")); + .put( + "organizationOrganization_dedup_isMergedIn", new RelationInverse() + .setInverse("merges") + .setRelation("isMergedIn") + .setRelType("organizationOrganization") + .setSubReltype("dedup")); relationInverseMap - .put( - "organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse() - .setInverse("isSimilarTo") - .setRelation("isSimilarTo") - .setRelType("organizationOrganization") - .setSubReltype("dedupSimilarity")); + .put( + "organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse() + .setInverse("isSimilarTo") + .setRelation("isSimilarTo") + .setRelType("organizationOrganization") + .setSubReltype("dedupSimilarity")); relationInverseMap - .put( - "resultProject_outcome_isProducedBy", new RelationInverse() - .setRelation("isProducedBy") - .setInverse("produces") - .setRelType("resultProject") - .setSubReltype("outcome")); + .put( + "resultProject_outcome_isProducedBy", new RelationInverse() + .setRelation("isProducedBy") + .setInverse("produces") + .setRelType("resultProject") + .setSubReltype("outcome")); relationInverseMap - .put( - "resultProject_outcome_produces", new RelationInverse() - .setInverse("isProducedBy") - .setRelation("produces") - .setRelType("resultProject") - .setSubReltype("outcome")); + .put( + "resultProject_outcome_produces", new RelationInverse() + .setInverse("isProducedBy") + .setRelation("produces") + .setRelType("resultProject") + .setSubReltype("outcome")); relationInverseMap - .put( - "projectPerson_contactPerson_isContact", new RelationInverse() - .setRelation("isContact") - .setInverse("hasContact") - .setRelType("projectPerson") - .setSubReltype("contactPerson")); + .put( + "projectPerson_contactPerson_isContact", new RelationInverse() + .setRelation("isContact") + .setInverse("hasContact") + .setRelType("projectPerson") + .setSubReltype("contactPerson")); relationInverseMap - .put( - "projectPerson_contactPerson_hasContact", new RelationInverse() - .setInverse("isContact") - .setRelation("hasContact") - .setRelType("personPerson") - .setSubReltype("coAuthorship")); + .put( + "projectPerson_contactPerson_hasContact", new RelationInverse() + .setInverse("isContact") + .setRelation("hasContact") + .setRelType("personPerson") + .setSubReltype("coAuthorship")); relationInverseMap - .put( - "personPerson_coAuthorship_isCoauthorOf", new RelationInverse() - .setInverse("isCoAuthorOf") - .setRelation("isCoAuthorOf") - .setRelType("personPerson") - .setSubReltype("coAuthorship")); + .put( + "personPerson_coAuthorship_isCoauthorOf", new RelationInverse() + .setInverse("isCoAuthorOf") + .setRelation("isCoAuthorOf") + .setRelType("personPerson") + .setSubReltype("coAuthorship")); relationInverseMap - .put( - "personPerson_dedup_merges", new RelationInverse() - .setInverse("isMergedIn") - .setRelation("merges") - .setRelType("personPerson") - .setSubReltype("dedup")); + .put( + "personPerson_dedup_merges", new RelationInverse() + .setInverse("isMergedIn") + .setRelation("merges") + .setRelType("personPerson") + .setSubReltype("dedup")); relationInverseMap - .put( - "personPerson_dedup_isMergedIn", new RelationInverse() - .setInverse("merges") - .setRelation("isMergedIn") - .setRelType("personPerson") - .setSubReltype("dedup")); + .put( + "personPerson_dedup_isMergedIn", new RelationInverse() + .setInverse("merges") + .setRelation("isMergedIn") + .setRelType("personPerson") + .setSubReltype("dedup")); relationInverseMap - .put( - "personPerson_dedupSimilarity_isSimilarTo", new RelationInverse() - .setInverse("isSimilarTo") - .setRelation("isSimilarTo") - .setRelType("personPerson") - .setSubReltype("dedupSimilarity")); + .put( + "personPerson_dedupSimilarity_isSimilarTo", new RelationInverse() + .setInverse("isSimilarTo") + .setRelation("isSimilarTo") + .setRelType("personPerson") + .setSubReltype("dedupSimilarity")); relationInverseMap - .put( - "datasourceOrganization_provision_isProvidedBy", new RelationInverse() - .setInverse("provides") - .setRelation("isProvidedBy") - .setRelType("datasourceOrganization") - .setSubReltype("provision")); + .put( + "datasourceOrganization_provision_isProvidedBy", new RelationInverse() + .setInverse("provides") + .setRelation("isProvidedBy") + .setRelType("datasourceOrganization") + .setSubReltype("provision")); relationInverseMap - .put( - "datasourceOrganization_provision_provides", new RelationInverse() - .setInverse("isProvidedBy") - .setRelation("provides") - .setRelType("datasourceOrganization") - .setSubReltype("provision")); + .put( + "datasourceOrganization_provision_provides", new RelationInverse() + .setInverse("isProvidedBy") + .setRelation("provides") + .setRelType("datasourceOrganization") + .setSubReltype("provision")); relationInverseMap - .put( - "resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse() - .setInverse("isAmongTopNSimilarDocuments") - .setRelation("hasAmongTopNSimilarDocuments") - .setRelType("resultResult") - .setSubReltype("similarity")); + .put( + "resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse() + .setInverse("isAmongTopNSimilarDocuments") + .setRelation("hasAmongTopNSimilarDocuments") + .setRelType("resultResult") + .setSubReltype("similarity")); relationInverseMap - .put( - "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse() - .setInverse("hasAmongTopNSimilarDocuments") - .setRelation("isAmongTopNSimilarDocuments") - .setRelType("resultResult") - .setSubReltype("similarity")); + .put( + "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse() + .setInverse("hasAmongTopNSimilarDocuments") + .setRelation("isAmongTopNSimilarDocuments") + .setRelType("resultResult") + .setSubReltype("similarity")); relationInverseMap - .put( - "resultResult_relationship_isRelatedTo", new RelationInverse() - .setInverse("isRelatedTo") - .setRelation("isRelatedTo") - .setRelType("resultResult") - .setSubReltype("relationship")); + .put( + "resultResult_relationship_isRelatedTo", new RelationInverse() + .setInverse("isRelatedTo") + .setRelation("isRelatedTo") + .setRelType("resultResult") + .setSubReltype("relationship")); relationInverseMap - .put( - "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse() - .setInverse("hasAmongTopNSimilarDocuments") - .setRelation("isAmongTopNSimilarDocuments") - .setRelType("resultResult") - .setSubReltype("similarity")); + .put( + "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse() + .setInverse("hasAmongTopNSimilarDocuments") + .setRelation("isAmongTopNSimilarDocuments") + .setRelType("resultResult") + .setSubReltype("similarity")); relationInverseMap - .put( - "resultResult_supplement_isSupplementTo", new RelationInverse() - .setInverse("isSupplementedBy") - .setRelation("isSupplementTo") - .setRelType("resultResult") - .setSubReltype("supplement")); + .put( + "resultResult_supplement_isSupplementTo", new RelationInverse() + .setInverse("isSupplementedBy") + .setRelation("isSupplementTo") + .setRelType("resultResult") + .setSubReltype("supplement")); relationInverseMap - .put( - "resultResult_supplement_isSupplementedBy", new RelationInverse() - .setInverse("isSupplementTo") - .setRelation("isSupplementedBy") - .setRelType("resultResult") - .setSubReltype("supplement")); + .put( + "resultResult_supplement_isSupplementedBy", new RelationInverse() + .setInverse("isSupplementTo") + .setRelation("isSupplementedBy") + .setRelType("resultResult") + .setSubReltype("supplement")); relationInverseMap - .put( - "resultResult_part_isPartOf", new RelationInverse() - .setInverse("hasPart") - .setRelation("isPartOf") - .setRelType("resultResult") - .setSubReltype("part")); + .put( + "resultResult_part_isPartOf", new RelationInverse() + .setInverse("hasPart") + .setRelation("isPartOf") + .setRelType("resultResult") + .setSubReltype("part")); relationInverseMap - .put( - "resultResult_part_hasPart", new RelationInverse() - .setInverse("isPartOf") - .setRelation("hasPart") - .setRelType("resultResult") - .setSubReltype("part")); + .put( + "resultResult_part_hasPart", new RelationInverse() + .setInverse("isPartOf") + .setRelation("hasPart") + .setRelType("resultResult") + .setSubReltype("part")); relationInverseMap - .put( - "resultResult_dedup_merges", new RelationInverse() - .setInverse("isMergedIn") - .setRelation("merges") - .setRelType("resultResult") - .setSubReltype("dedup")); + .put( + "resultResult_dedup_merges", new RelationInverse() + .setInverse("isMergedIn") + .setRelation("merges") + .setRelType("resultResult") + .setSubReltype("dedup")); relationInverseMap - .put( - "resultResult_dedup_isMergedIn", new RelationInverse() - .setInverse("merges") - .setRelation("isMergedIn") - .setRelType("resultResult") - .setSubReltype("dedup")); + .put( + "resultResult_dedup_isMergedIn", new RelationInverse() + .setInverse("merges") + .setRelation("isMergedIn") + .setRelType("resultResult") + .setSubReltype("dedup")); relationInverseMap - .put( - "resultResult_dedupSimilarity_isSimilarTo", new RelationInverse() - .setInverse("isSimilarTo") - .setRelation("isSimilarTo") - .setRelType("resultResult") - .setSubReltype("dedupSimilarity")); + .put( + "resultResult_dedupSimilarity_isSimilarTo", new RelationInverse() + .setInverse("isSimilarTo") + .setRelation("isSimilarTo") + .setRelType("resultResult") + .setSubReltype("dedupSimilarity")); } @@ -293,7 +299,7 @@ public class ModelSupport { * @return True if X is a subclass of Y */ public static Boolean isSubClass( - X subClazzObject, Y superClazzObject) { + X subClazzObject, Y superClazzObject) { return isSubClass(subClazzObject.getClass(), superClazzObject.getClass()); } @@ -307,7 +313,7 @@ public class ModelSupport { * @return True if X is a subclass of Y */ public static Boolean isSubClass( - X subClazzObject, Class superClazz) { + X subClazzObject, Class superClazz) { return isSubClass(subClazzObject.getClass(), superClazz); } @@ -321,7 +327,7 @@ public class ModelSupport { * @return True if X is a subclass of Y */ public static Boolean isSubClass( - Class subClazz, Class superClazz) { + Class subClazz, Class superClazz) { return superClazz.isAssignableFrom(subClazz); } @@ -333,32 +339,32 @@ public class ModelSupport { */ public static Class[] getOafModelClasses() { return new Class[] { - Author.class, - Context.class, - Country.class, - DataInfo.class, - Dataset.class, - Datasource.class, - ExternalReference.class, - ExtraInfo.class, - Field.class, - GeoLocation.class, - Instance.class, - Journal.class, - KeyValue.class, - Oaf.class, - OafEntity.class, - OAIProvenance.class, - Organization.class, - OriginDescription.class, - OtherResearchProduct.class, - Project.class, - Publication.class, - Qualifier.class, - Relation.class, - Result.class, - Software.class, - StructuredProperty.class + Author.class, + Context.class, + Country.class, + DataInfo.class, + Dataset.class, + Datasource.class, + ExternalReference.class, + ExtraInfo.class, + Field.class, + GeoLocation.class, + Instance.class, + Journal.class, + KeyValue.class, + Oaf.class, + OafEntity.class, + OAIProvenance.class, + Organization.class, + OriginDescription.class, + OtherResearchProduct.class, + Project.class, + Publication.class, + Qualifier.class, + Relation.class, + Result.class, + Software.class, + StructuredProperty.class }; } @@ -372,10 +378,25 @@ public class ModelSupport { public static String getScheme(final String sourceType, final String targetType) { return String - .format( - schemeTemplate, - entityMapping.get(EntityType.valueOf(sourceType)).name(), - entityMapping.get(EntityType.valueOf(targetType)).name()); + .format( + schemeTemplate, + entityMapping.get(EntityType.valueOf(sourceType)).name(), + entityMapping.get(EntityType.valueOf(targetType)).name()); + } + + public static String tableIdentifier(String dbName, String tableName) { + + checkArgument(StringUtils.isNotBlank(dbName), "DB name cannot be empty"); + checkArgument(StringUtils.isNotBlank(tableName), "table name cannot be empty"); + + return String.format("%s.%s", dbName, tableName); + } + + public static String tableIdentifier(String dbName, Class clazz) { + + checkArgument(Objects.nonNull(clazz), "clazz is needed to derive the table name, thus cannot be null"); + + return tableIdentifier(dbName, clazz.getSimpleName().toLowerCase()); } public static Function idFn() { @@ -390,38 +411,38 @@ public class ModelSupport { private static String idFnForRelation(T t) { Relation r = (Relation) t; return Optional - .ofNullable(r.getSource()) - .map( - source -> Optional - .ofNullable(r.getTarget()) - .map( - target -> Optional - .ofNullable(r.getRelType()) - .map( - relType -> Optional - .ofNullable(r.getSubRelType()) - .map( - subRelType -> Optional - .ofNullable(r.getRelClass()) - .map( - relClass -> String - .join( - source, - target, - relType, - subRelType, - relClass)) - .orElse( - String - .join( - source, - target, - relType, - subRelType))) - .orElse(String.join(source, target, relType))) - .orElse(String.join(source, target))) - .orElse(source)) - .orElse(null); + .ofNullable(r.getSource()) + .map( + source -> Optional + .ofNullable(r.getTarget()) + .map( + target -> Optional + .ofNullable(r.getRelType()) + .map( + relType -> Optional + .ofNullable(r.getSubRelType()) + .map( + subRelType -> Optional + .ofNullable(r.getRelClass()) + .map( + relClass -> String + .join( + source, + target, + relType, + subRelType, + relClass)) + .orElse( + String + .join( + source, + target, + relType, + subRelType))) + .orElse(String.join(source, target, relType))) + .orElse(String.join(source, target))) + .orElse(source)) + .orElse(null); } private static String idFnForOafEntity(T t) { diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java index 90d573ac0..e55c0eb7b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java @@ -523,7 +523,9 @@ public class ProtoConverter implements Serializable { } private static Context mapContext(ResultProtos.Result.Context context) { - + if (context == null || StringUtils.isBlank(context.getId())) { + return null; + } final Context entity = new Context(); entity.setId(context.getId()); entity @@ -537,6 +539,10 @@ public class ProtoConverter implements Serializable { } public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { + if (kv == null || StringUtils.isBlank(kv.getKey()) & StringUtils.isBlank(kv.getValue())) { + return null; + } + final KeyValue keyValue = new KeyValue(); keyValue.setKey(kv.getKey()); keyValue.setValue(kv.getValue()); @@ -575,6 +581,10 @@ public class ProtoConverter implements Serializable { } public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) { + if (sp == null | StringUtils.isBlank(sp.getValue())) { + return null; + } + final StructuredProperty structuredProperty = new StructuredProperty(); structuredProperty.setValue(sp.getValue()); structuredProperty.setQualifier(mapQualifier(sp.getQualifier())); @@ -611,6 +621,10 @@ public class ProtoConverter implements Serializable { } public static Field mapStringField(FieldTypeProtos.StringField s) { + if (s == null || StringUtils.isBlank(s.getValue())) { + return null; + } + final Field stringField = new Field<>(); stringField.setValue(s.getValue()); stringField.setDataInfo(mapDataInfo(s.getDataInfo())); @@ -618,19 +632,16 @@ public class ProtoConverter implements Serializable { } public static Field mapBoolField(FieldTypeProtos.BoolField b) { + if (b == null) { + return null; + } + final Field booleanField = new Field<>(); booleanField.setValue(b.getValue()); booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); return booleanField; } - public static Field mapIntField(FieldTypeProtos.IntField b) { - final Field entity = new Field<>(); - entity.setValue(b.getValue()); - entity.setDataInfo(mapDataInfo(b.getDataInfo())); - return entity; - } - public static Journal mapJournal(FieldTypeProtos.Journal j) { final Journal journal = new Journal(); journal.setConferencedate(j.getConferencedate()); diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java index d5c2b518a..b4bcc509e 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/PrepareMergedRelationJob.java @@ -18,6 +18,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareMergedRelationJob { @@ -56,6 +57,7 @@ public class PrepareMergedRelationJob { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); selectMergesRelations( spark, inputPath, @@ -73,19 +75,6 @@ public class PrepareMergedRelationJob { .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath); -// relation.createOrReplaceTempView("relation"); -// -// spark -// .sql( -// "Select * from relation " + -// "where relclass = 'merges' " + -// "and datainfo.deletedbyinference = false") -// .as(Encoders.bean(Relation.class)) -// .toJSON() -// .write() -// .mode(SaveMode.Overwrite) -// .option("compression", "gzip") -// .text(outputPath); } public static org.apache.spark.sql.Dataset readRelations( @@ -97,4 +86,9 @@ public class PrepareMergedRelationJob { (MapFunction) value -> OBJECT_MAPPER.readValue(value, Relation.class), Encoders.bean(Relation.class)); } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + } diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java index 704cab375..2caa66db4 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/ReadBlacklistFromDB.java @@ -65,8 +65,7 @@ public class ReadBlacklistFromDB implements Closeable { } } - public void execute(final String sql, final Function> producer) - throws Exception { + public void execute(final String sql, final Function> producer) throws Exception { final Consumer consumer = rs -> producer.apply(rs).forEach(r -> writeRelation(r)); diff --git a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java index c5104058c..92289ec2d 100644 --- a/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java +++ b/dhp-workflows/dhp-blacklist/src/main/java/eu/dnetlib/dhp/blacklist/SparkRemoveBlacklistedRelationJob.java @@ -18,6 +18,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; @@ -62,6 +63,7 @@ public class SparkRemoveBlacklistedRelationJob { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); removeBlacklistedRelations( spark, blacklistPath, @@ -69,7 +71,6 @@ public class SparkRemoveBlacklistedRelationJob { outputPath, mergesPath); }); - } private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath, @@ -84,7 +85,7 @@ public class SparkRemoveBlacklistedRelationJob { .joinWith( mergesRelation, blackListed.col("source").equalTo(mergesRelation.col("target")), "left_outer") - .map(c -> { + .map((MapFunction, Relation>) c -> { Optional .ofNullable(c._2()) .ifPresent(mr -> c._1().setSource(mr.getSource())); @@ -95,7 +96,7 @@ public class SparkRemoveBlacklistedRelationJob { .joinWith( mergesRelation, dedupSource.col("target").equalTo(mergesRelation.col("target")), "left_outer") - .map(c -> { + .map((MapFunction, Relation>) c -> { Optional .ofNullable(c._2()) .ifPresent(mr -> c._1().setTarget(mr.getSource())); @@ -107,7 +108,6 @@ public class SparkRemoveBlacklistedRelationJob { .mode(SaveMode.Overwrite) .json(blacklistPath + "/deduped"); - inputRelation .joinWith( dedupBL, (inputRelation @@ -118,26 +118,23 @@ public class SparkRemoveBlacklistedRelationJob { .col("target") .equalTo(dedupBL.col("target")))), "left_outer") - .map(c -> { - Relation ir = c._1(); - Optional obl = Optional.ofNullable(c._2()); - if (obl.isPresent()) { - if (ir.equals(obl.get())) { - return null; + .map((MapFunction, Relation>) c -> { + Relation ir = c._1(); + Optional obl = Optional.ofNullable(c._2()); + if (obl.isPresent()) { + if (ir.equals(obl.get())) { + return null; + } } - } - return ir; - - }, Encoders.bean(Relation.class)) + return ir; + }, Encoders.bean(Relation.class)) .filter(Objects::nonNull) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath); - } - public static org.apache.spark.sql.Dataset readRelations( SparkSession spark, String inputPath) { return spark @@ -148,4 +145,8 @@ public class SparkRemoveBlacklistedRelationJob { Encoders.bean(Relation.class)); } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + } diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml index b98001662..dd7827da4 100644 --- a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml @@ -1,45 +1,138 @@ - - - - postgresURL - the url of the postgress server to query - - - postgresUser - the username to access the postgres db - - - postgresPassword - the postgres password - - - sourcePath - the source path - - - outputPath - the path were to store the graph without the blacklisted relations - - - + + + + postgresURL + the url of the postgress server to query + + + postgresUser + the username to access the postgres db + + + postgresPassword + the postgres password + + + sourcePath + the source path + + + outputPath + the graph output path + + - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + - + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + - + + + + + + + + + + + + + + ${nameNode}/${sourcePath}/publication + ${nameNode}/${outputPath}/publication + + + + + + + + ${nameNode}/${sourcePath}/dataset + ${nameNode}/${outputPath}/dataset + + + + + + + + ${nameNode}/${sourcePath}/otherresearchproduct + ${nameNode}/${outputPath}/otherresearchproduct + + + + + + + + ${nameNode}/${sourcePath}/software + ${nameNode}/${outputPath}/software + + + + + + + + ${nameNode}/${sourcePath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + + ${nameNode}/${sourcePath}/project + ${nameNode}/${outputPath}/project + + + + + + + + ${nameNode}/${sourcePath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + - ${jobTracker} - ${nameNode} eu.dnetlib.dhp.blacklist.ReadBlacklistFromDB --hdfsPath${workingDir}/blacklist --hdfsNameNode${nameNode} @@ -66,6 +159,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/relation --outputPath${workingDir}/mergesRelation @@ -90,6 +184,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 --sourcePath${sourcePath}/relation --outputPath${outputPath}/relation @@ -99,5 +194,7 @@ - + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java index 0694556b2..9e5d98644 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/EventFactory.java @@ -29,31 +29,32 @@ public class EventFactory { "yyyy-MM-dd" }; - public static Event newBrokerEvent(final Result source, final Result target, final UpdateInfo updateInfo) { + public static Event newBrokerEvent(final UpdateInfo updateInfo) { final long now = new Date().getTime(); final Event res = new Event(); - final Map map = createMapFromResult(target, source, updateInfo); + final Map map = createMapFromResult(updateInfo); - final String payload = createPayload(target, updateInfo); + final String payload = createPayload(updateInfo); final String eventId = calculateEventId( - updateInfo.getTopic(), target.getOriginalId().get(0), updateInfo.getHighlightValueAsString()); + updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0), + updateInfo.getHighlightValueAsString()); res.setEventId(eventId); res.setProducerId(PRODUCER_ID); res.setPayload(payload); res.setMap(map); - res.setTopic(updateInfo.getTopic()); + res.setTopic(updateInfo.getTopicPath()); res.setCreationDate(now); res.setExpiryDate(calculateExpiryDate(now)); res.setInstantMessage(false); return res; } - private static String createPayload(final Result result, final UpdateInfo updateInfo) { + private static String createPayload(final UpdateInfo updateInfo) { final OpenAireEventPayload payload = new OpenAireEventPayload(); // TODO @@ -62,32 +63,34 @@ public class EventFactory { return payload.toJSON(); } - private static Map createMapFromResult(final Result oaf, final Result source, - final UpdateInfo updateInfo) { + private static Map createMapFromResult(final UpdateInfo updateInfo) { final Map map = new HashMap<>(); - final List collectedFrom = oaf.getCollectedfrom(); + final Result source = updateInfo.getSource(); + final Result target = updateInfo.getTarget(); + + final List collectedFrom = target.getCollectedfrom(); if (collectedFrom.size() == 1) { map.put("target_datasource_id", collectedFrom.get(0).getKey()); map.put("target_datasource_name", collectedFrom.get(0).getValue()); } - final List ids = oaf.getOriginalId(); + final List ids = target.getOriginalId(); if (ids.size() > 0) { map.put("target_publication_id", ids.get(0)); } - final List titles = oaf.getTitle(); + final List titles = target.getTitle(); if (titles.size() > 0) { map.put("target_publication_title", titles.get(0)); } - final long date = parseDateTolong(oaf.getDateofacceptance().getValue()); + final long date = parseDateTolong(target.getDateofacceptance().getValue()); if (date > 0) { map.put("target_dateofacceptance", date); } - final List subjects = oaf.getSubject(); + final List subjects = target.getSubject(); if (subjects.size() > 0) { map .put( @@ -95,7 +98,7 @@ public class EventFactory { subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList())); } - final List authors = oaf.getAuthor(); + final List authors = target.getAuthor(); if (authors.size() > 0) { map .put( diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java new file mode 100644 index 000000000..29f6cbe3a --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/model/Topic.java @@ -0,0 +1,52 @@ + +package eu.dnetlib.dhp.broker.model; + +public enum Topic { + + // ENRICHMENT MISSING + ENRICH_MISSING_OA_VERSION("ENRICH/MISSING/OPENACCESS_VERSION"), ENRICH_MISSING_ABSTRACT( + "ENRICH/MISSING/ABSTRACT"), ENRICH_MISSING_PUBLICATION_DATE( + "ENRICH/MISSING/PUBLICATION_DATE"), ENRICH_MISSING_PID( + "ENRICH/MISSING/PID"), ENRICH_MISSING_PROJECT("ENRICH/MISSING/PROJECT"), ENRICH_MISSING_SOFTWARE( + "ENRICH/MISSING/SOFTWARE"), ENRICH_MISSING_SUBJECT_MESHEUROPMC( + "ENRICH/MISSING/SUBJECT/MESHEUROPMC"), ENRICH_MISSING_SUBJECT_ARXIV( + "ENRICH/MISSING/SUBJECT/ARXIV"), ENRICH_MISSING_SUBJECT_JEL( + "ENRICH/MISSING/SUBJECT/JEL"), ENRICH_MISSING_SUBJECT_DDC( + "ENRICH/MISSING/SUBJECT/DDC"), ENRICH_MISSING_SUBJECT_ACM( + "ENRICH/MISSING/SUBJECT/ACM"), ENRICH_MISSING_SUBJECT_RVK( + "ENRICH/MISSING/SUBJECT/RVK"), ENRICH_MISSING_AUTHOR_ORCID( + "ENRICH/MISSING/AUTHOR/ORCID"), + + // ENRICHMENT MORE + ENRICH_MORE_PID("ENRICH/MORE/PID"), ENRICH_MORE_OA_VERSION("ENRICH/MORE/OPENACCESS_VERSION"), ENRICH_MORE_ABSTRACT( + "ENRICH/MORE/ABSTRACT"), ENRICH_MORE_PUBLICATION_DATE("ENRICH/MORE/PUBLICATION_DATE"), ENRICH_MORE_PROJECT( + "ENRICH/MORE/PROJECT"), ENRICH_MORE_SUBJECT_MESHEUROPMC( + "ENRICH/MORE/SUBJECT/MESHEUROPMC"), ENRICH_MORE_SUBJECT_ARXIV( + "ENRICH/MORE/SUBJECT/ARXIV"), ENRICH_MORE_SUBJECT_JEL( + "ENRICH/MORE/SUBJECT/JEL"), ENRICH_MORE_SUBJECT_DDC( + "ENRICH/MORE/SUBJECT/DDC"), ENRICH_MORE_SUBJECT_ACM( + "ENRICH/MORE/SUBJECT/ACM"), ENRICH_MORE_SUBJECT_RVK("ENRICH/MORE/SUBJECT/RVK"), + + // ADDITION + ADD_BY_PROJECT("ADD/BY_PROJECT"); + + Topic(final String path) { + this.path = path; + } + + protected String path; + + public String getPath() { + return this.path; + } + + public static Topic fromPath(final String path) { + for (final Topic t : Topic.values()) { + if (t.getPath().equals(path)) { + return t; + } + } + return null; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 54d4ef36a..43ebd6dd8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -14,21 +14,20 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.broker.model.Event; import eu.dnetlib.dhp.broker.model.EventFactory; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAbstract; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingAuthorOrcid; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingOpenAccess; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPid; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingProject; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingPublicationDate; -import eu.dnetlib.dhp.broker.oa.util.EnrichMissingSubject; -import eu.dnetlib.dhp.broker.oa.util.EnrichMoreOpenAccess; -import eu.dnetlib.dhp.broker.oa.util.EnrichMorePid; -import eu.dnetlib.dhp.broker.oa.util.EnrichMoreSubject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAbstract; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAuthorOrcid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingOpenAccess; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingProject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationDate; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSubject; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreOpenAccess; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMorePid; +import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSubject; +import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.Result; @@ -37,7 +36,16 @@ public class GenerateEventsApplication { private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final UpdateMatcher enrichMissingAbstract = new EnrichMissingAbstract(); + private static final UpdateMatcher enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid(); + private static final UpdateMatcher enrichMissingOpenAccess = new EnrichMissingOpenAccess(); + private static final UpdateMatcher enrichMissingPid = new EnrichMissingPid(); + private static final UpdateMatcher enrichMissingProject = new EnrichMissingProject(); + private static final UpdateMatcher enrichMissingPublicationDate = new EnrichMissingPublicationDate(); + private static final UpdateMatcher enrichMissingSubject = new EnrichMissingSubject(); + private static final UpdateMatcher enrichMoreOpenAccess = new EnrichMoreOpenAccess(); + private static final UpdateMatcher enrichMorePid = new EnrichMorePid(); + private static final UpdateMatcher enrichMoreSubject = new EnrichMoreSubject(); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -76,37 +84,22 @@ public class GenerateEventsApplication { } private List generateEvents(final Result... children) { - final List list = new ArrayList<>(); + final List> list = new ArrayList<>(); - for (final Result source : children) { - for (final Result target : children) { - if (source != target) { - list - .addAll( - findUpdates(source, target) - .stream() - .map(info -> EventFactory.newBrokerEvent(source, target, info)) - .collect(Collectors.toList())); - } - } + for (final Result target : children) { + list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingProject.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children)); + list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children)); + list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children)); + list.addAll(enrichMorePid.searchUpdatesForRecord(target, children)); + list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, children)); } - return list; - } - - private List> findUpdates(final Result source, final Result target) { - final List> list = new ArrayList<>(); - list.addAll(EnrichMissingAbstract.findUpdates(source, target)); - list.addAll(EnrichMissingAuthorOrcid.findUpdates(source, target)); - list.addAll(EnrichMissingOpenAccess.findUpdates(source, target)); - list.addAll(EnrichMissingPid.findUpdates(source, target)); - list.addAll(EnrichMissingProject.findUpdates(source, target)); - list.addAll(EnrichMissingPublicationDate.findUpdates(source, target)); - list.addAll(EnrichMissingSubject.findUpdates(source, target)); - list.addAll(EnrichMoreOpenAccess.findUpdates(source, target)); - list.addAll(EnrichMorePid.findUpdates(source, target)); - list.addAll(EnrichMoreSubject.findUpdates(source, target)); - return list; + return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java new file mode 100644 index 000000000..43cf738f8 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAbstract.java @@ -0,0 +1,36 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingAbstract extends UpdateMatcher { + + public EnrichMissingAbstract() { + super(false); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) { + return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target)); + } + return new ArrayList<>(); + } + + @Override + public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_ABSTRACT, + highlightValue, source, target, + (p, s) -> p.getAbstracts().add(s), + s -> s); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java new file mode 100644 index 000000000..beeccdbe8 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingAuthorOrcid.java @@ -0,0 +1,34 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingAuthorOrcid extends UpdateMatcher> { + + public EnrichMissingAuthorOrcid() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { + // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); + return Arrays.asList(); + } + + @Override + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_AUTHOR_ORCID, + highlightValue, source, target, + (p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java new file mode 100644 index 000000000..a4a2ea0c6 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingOpenAccess.java @@ -0,0 +1,55 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import eu.dnetlib.broker.objects.Instance; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingOpenAccess extends UpdateMatcher { + + public EnrichMissingOpenAccess() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + final long count = target + .getInstance() + .stream() + .map(i -> i.getAccessright().getClassid()) + .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS)) + .count(); + + if (count > 0) { + return Arrays.asList(); + } + + return source + .getInstance() + .stream() + .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) + .map(ConversionUtils::oafInstanceToBrokerInstances) + .flatMap(s -> s) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo generateUpdateInfo(final Instance highlightValue, + final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_OA_VERSION, + highlightValue, source, target, + (p, i) -> p.getInstances().add(i), + Instance::getUrl); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java new file mode 100644 index 000000000..a8df62541 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPid.java @@ -0,0 +1,45 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import eu.dnetlib.broker.objects.Pid; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPid extends UpdateMatcher { + + public EnrichMissingPid() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + final long count = target.getPid().size(); + + if (count > 0) { + return Arrays.asList(); + } + + return source + .getPid() + .stream() + .map(ConversionUtils::oafPidToBrokerPid) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PID, + highlightValue, source, target, + (p, pid) -> p.getPids().add(pid), + pid -> pid.getType() + "::" + pid.getValue()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java new file mode 100644 index 000000000..b6e5b3b57 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingProject.java @@ -0,0 +1,35 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import eu.dnetlib.broker.objects.Project; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingProject extends UpdateMatcher { + + public EnrichMissingProject() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); + return Arrays.asList(); + } + + @Override + public UpdateInfo generateUpdateInfo(final Project highlightValue, + final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PROJECT, + highlightValue, source, target, + (p, prj) -> p.getProjects().add(prj), + prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java new file mode 100644 index 000000000..e9ec082c4 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingPublicationDate.java @@ -0,0 +1,33 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.List; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMissingPublicationDate extends UpdateMatcher { + + public EnrichMissingPublicationDate() { + super(false); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); + return Arrays.asList(); + } + + @Override + public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MISSING_PUBLICATION_DATE, + highlightValue, source, target, + (p, date) -> p.setPublicationdate(date), + s -> s); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java new file mode 100644 index 000000000..79e9d469b --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMissingSubject.java @@ -0,0 +1,53 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class EnrichMissingSubject extends UpdateMatcher> { + + public EnrichMissingSubject() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { + final Set existingTypes = target + .getSubject() + .stream() + .map(StructuredProperty::getQualifier) + .map(Qualifier::getClassid) + .collect(Collectors.toSet()); + + return source + .getPid() + .stream() + .filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid())) + .map(ConversionUtils::oafSubjectToPair) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, + final Result target) { + + return new UpdateInfo<>( + Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()), + highlightValue, source, target, + (p, pair) -> p.getSubjects().add(pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java new file mode 100644 index 000000000..40c9b0500 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreOpenAccess.java @@ -0,0 +1,53 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import eu.dnetlib.broker.objects.Instance; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMoreOpenAccess extends UpdateMatcher { + + public EnrichMoreOpenAccess() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + final Set urls = target + .getInstance() + .stream() + .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) + .map(i -> i.getUrl()) + .flatMap(List::stream) + .collect(Collectors.toSet()); + + return source + .getInstance() + .stream() + .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) + .map(ConversionUtils::oafInstanceToBrokerInstances) + .flatMap(s -> s) + .filter(i -> !urls.contains(i.getUrl())) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo generateUpdateInfo(final Instance highlightValue, + final Result source, + final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_OA_VERSION, + highlightValue, source, target, + (p, i) -> p.getInstances().add(i), + Instance::getUrl); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java new file mode 100644 index 000000000..0e7b7766a --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMorePid.java @@ -0,0 +1,46 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import eu.dnetlib.broker.objects.Pid; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMorePid extends UpdateMatcher { + + public EnrichMorePid() { + super(true); + } + + @Override + protected List> findUpdates(final Result source, final Result target) { + final Set existingPids = target + .getPid() + .stream() + .map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue()) + .collect(Collectors.toSet()); + + return source + .getPid() + .stream() + .filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) + .map(ConversionUtils::oafPidToBrokerPid) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) { + return new UpdateInfo<>( + Topic.ENRICH_MORE_PID, + highlightValue, source, target, + (p, pid) -> p.getPids().add(pid), + pid -> pid.getType() + "::" + pid.getValue()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java new file mode 100644 index 000000000..e6374479b --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/EnrichMoreSubject.java @@ -0,0 +1,50 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class EnrichMoreSubject extends UpdateMatcher> { + + public EnrichMoreSubject() { + super(true); + } + + @Override + protected List>> findUpdates(final Result source, final Result target) { + final Set existingSubjects = target + .getSubject() + .stream() + .map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue()) + .collect(Collectors.toSet()); + + return source + .getPid() + .stream() + .filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) + .map(ConversionUtils::oafSubjectToPair) + .map(i -> generateUpdateInfo(i, source, target)) + .collect(Collectors.toList()); + } + + @Override + public UpdateInfo> generateUpdateInfo(final Pair highlightValue, + final Result source, + final Result target) { + + return new UpdateInfo<>( + Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()), + highlightValue, source, target, + (p, pair) -> p.getSubjects().add(pair.getRight()), + pair -> pair.getLeft() + "::" + pair.getRight()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java new file mode 100644 index 000000000..b8b6132cd --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -0,0 +1,64 @@ + +package eu.dnetlib.dhp.broker.oa.matchers; + +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Result; + +public abstract class UpdateMatcher { + + private final boolean multipleUpdate; + + public UpdateMatcher(final boolean multipleUpdate) { + this.multipleUpdate = multipleUpdate; + } + + public Collection> searchUpdatesForRecord(final Result res, final Result... others) { + + final Map> infoMap = new HashMap<>(); + + for (final Result source : others) { + if (source != res) { + for (final UpdateInfo info : findUpdates(source, res)) { + final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); + if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { + } else { + infoMap.put(s, info); + } + } + } + } + + final Collection> values = infoMap.values(); + + if (values.isEmpty() || multipleUpdate) { + return values; + } else { + final UpdateInfo v = values + .stream() + .sorted((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust())) + .findFirst() + .get(); + return Arrays.asList(v); + } + } + + protected abstract List> findUpdates(Result source, Result target); + + protected abstract UpdateInfo generateUpdateInfo(final T highlightValue, final Result source, + final Result target); + + protected static boolean isMissing(final List> list) { + return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java new file mode 100644 index 000000000..d61d5bfb7 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java @@ -0,0 +1,7 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +public class BrokerConstants { + + public final static String OPEN_ACCESS = "OPEN"; +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java new file mode 100644 index 000000000..2e2ce202a --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -0,0 +1,36 @@ + +package eu.dnetlib.dhp.broker.oa.util; + +import java.util.stream.Stream; + +import org.apache.commons.lang3.tuple.Pair; + +import eu.dnetlib.broker.objects.Instance; +import eu.dnetlib.broker.objects.Pid; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class ConversionUtils { + + public static Stream oafInstanceToBrokerInstances(final eu.dnetlib.dhp.schema.oaf.Instance i) { + return i.getUrl().stream().map(url -> { + final Instance r = new Instance(); + r.setUrl(url); + r.setInstancetype(i.getInstancetype().getClassid()); + r.setLicense(BrokerConstants.OPEN_ACCESS); + r.setHostedby(i.getHostedby().getValue()); + return r; + }); + } + + public static Pid oafPidToBrokerPid(final StructuredProperty sp) { + final Pid pid = new Pid(); + pid.setValue(sp.getValue()); + pid.setType(sp.getQualifier().getClassid()); + return pid; + } + + public static final Pair oafSubjectToPair(final StructuredProperty sp) { + return Pair.of(sp.getQualifier().getClassid(), sp.getValue()); + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java deleted file mode 100644 index 493d1f97c..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAbstract.java +++ /dev/null @@ -1,31 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingAbstract extends UpdateInfo { - - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - private EnrichMissingAbstract(final String highlightValue, final float trust) { - super("ENRICH/MISSING/ABSTRACT", highlightValue, trust); - } - - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getAbstracts().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java deleted file mode 100644 index 6899c62a3..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingAuthorOrcid.java +++ /dev/null @@ -1,31 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingAuthorOrcid extends UpdateInfo { - - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - private EnrichMissingAuthorOrcid(final String highlightValue, final float trust) { - super("ENRICH/MISSING/AUTHOR/ORCID", highlightValue, trust); - } - - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - // TODO - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java deleted file mode 100644 index 9464130f3..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingOpenAccess.java +++ /dev/null @@ -1,32 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingOpenAccess extends UpdateInfo { - - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - private EnrichMissingOpenAccess(final Instance highlightValue, final float trust) { - super("ENRICH/MISSING/OPENACCESS_VERSION", highlightValue, trust); - } - - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getInstances().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getUrl(); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java deleted file mode 100644 index 293d4993f..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPid.java +++ /dev/null @@ -1,32 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.broker.objects.Pid; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingPid extends UpdateInfo { - - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - private EnrichMissingPid(final Pid highlightValue, final float trust) { - super("ENRICH/MISSING/PID", highlightValue, trust); - } - - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getPids().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getType() + "::" + getHighlightValue().getValue(); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java deleted file mode 100644 index a22c179a2..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingProject.java +++ /dev/null @@ -1,33 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.broker.objects.Project; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingProject extends UpdateInfo { - - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - private EnrichMissingProject(final Project highlightValue, final float trust) { - super("ENRICH/MISSING/PROJECT", highlightValue, trust); - } - - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getProjects().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getFunder() + "::" + getHighlightValue().getFundingProgram() - + getHighlightValue().getCode(); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java deleted file mode 100644 index 869dca264..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingPublicationDate.java +++ /dev/null @@ -1,31 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingPublicationDate extends UpdateInfo { - - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - private EnrichMissingPublicationDate(final String highlightValue, final float trust) { - super("ENRICH/MISSING/PUBLICATION_DATE", highlightValue, trust); - } - - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().setPublicationdate(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java deleted file mode 100644 index a2ed5d043..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMissingSubject.java +++ /dev/null @@ -1,36 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMissingSubject extends UpdateInfo { - - public static List findUpdates(final Result source, final Result target) { - // MESHEUROPMC - // ARXIV - // JEL - // DDC - // ACM - - return Arrays.asList(); - } - - private EnrichMissingSubject(final String subjectClassification, final String highlightValue, final float trust) { - super("ENRICH/MISSING/SUBJECT/" + subjectClassification, highlightValue, trust); - } - - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getSubjects().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java deleted file mode 100644 index 4f1e88d3d..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreOpenAccess.java +++ /dev/null @@ -1,32 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.Instance; -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMoreOpenAccess extends UpdateInfo { - - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - private EnrichMoreOpenAccess(final Instance highlightValue, final float trust) { - super("ENRICH/MORE/OPENACCESS_VERSION", highlightValue, trust); - } - - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getInstances().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getUrl(); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java deleted file mode 100644 index ecf2cf310..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMorePid.java +++ /dev/null @@ -1,32 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.broker.objects.Pid; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMorePid extends UpdateInfo { - - public static List findUpdates(final Result source, final Result target) { - // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); - return Arrays.asList(); - } - - private EnrichMorePid(final Pid highlightValue, final float trust) { - super("ENRICH/MORE/PID", highlightValue, trust); - } - - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getPids().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue().getType() + "::" + getHighlightValue().getValue(); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java deleted file mode 100644 index f29b86292..000000000 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EnrichMoreSubject.java +++ /dev/null @@ -1,36 +0,0 @@ - -package eu.dnetlib.dhp.broker.oa.util; - -import java.util.Arrays; -import java.util.List; - -import eu.dnetlib.broker.objects.OpenAireEventPayload; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class EnrichMoreSubject extends UpdateInfo { - - public static List findUpdates(final Result source, final Result target) { - // MESHEUROPMC - // ARXIV - // JEL - // DDC - // ACM - - return Arrays.asList(); - } - - private EnrichMoreSubject(final String subjectClassification, final String highlightValue, final float trust) { - super("ENRICH/MORE/SUBJECT/" + subjectClassification, highlightValue, trust); - } - - @Override - public void compileHighlight(final OpenAireEventPayload payload) { - payload.getHighlight().getSubjects().add(getHighlightValue()); - } - - @Override - public String getHighlightValueAsString() { - return getHighlightValue(); - } - -} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index f7b6b69e9..5cc0d371d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -1,36 +1,77 @@ package eu.dnetlib.dhp.broker.oa.util; +import java.util.function.BiConsumer; +import java.util.function.Function; + import eu.dnetlib.broker.objects.OpenAireEventPayload; +import eu.dnetlib.broker.objects.Publication; +import eu.dnetlib.dhp.broker.model.Topic; +import eu.dnetlib.dhp.schema.oaf.Result; -public abstract class UpdateInfo { +public final class UpdateInfo { - private final String topic; + private final Topic topic; private final T highlightValue; + private final Result source; + + private final Result target; + + private final BiConsumer compileHighlight; + + private final Function highlightToString; + private final float trust; - protected UpdateInfo(final String topic, final T highlightValue, final float trust) { + public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target, + final BiConsumer compileHighlight, + final Function highlightToString) { this.topic = topic; this.highlightValue = highlightValue; - this.trust = trust; + this.source = source; + this.target = target; + this.compileHighlight = compileHighlight; + this.highlightToString = highlightToString; + this.trust = calculateTrust(source, target); } public T getHighlightValue() { return highlightValue; } + public Result getSource() { + return source; + } + + public Result getTarget() { + return target; + } + + private float calculateTrust(final Result source, final Result target) { + // TODO + return 0.9f; + } + + protected Topic getTopic() { + return topic; + } + + public String getTopicPath() { + return topic.getPath(); + } + public float getTrust() { return trust; } - public String getTopic() { - return topic; + public void compileHighlight(final OpenAireEventPayload payload) { + compileHighlight.accept(payload.getHighlight(), getHighlightValue()); } - abstract public void compileHighlight(OpenAireEventPayload payload); - - abstract public String getHighlightValueAsString(); + public String getHighlightValueAsString() { + return highlightToString.apply(getHighlightValue()); + } } diff --git a/dhp-workflows/dhp-bulktag/project-default.properties b/dhp-workflows/dhp-bulktag/project-default.properties deleted file mode 100644 index 84a56f19f..000000000 --- a/dhp-workflows/dhp-bulktag/project-default.properties +++ /dev/null @@ -1,7 +0,0 @@ -#sandboxName when not provided explicitly will be generated -sandboxName=${sandboxName} -sandboxDir=/user/${dhp.hadoop.frontend.user.name}/${sandboxName} -workingDir=${sandboxDir}/working_dir -oozie.wf.application.path = ${nameNode}${sandboxDir}/${oozieAppDir} -oozieTopWfApplicationPath = ${oozie.wf.application.path} - diff --git a/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/CommunityConfigurationFactoryTest.java b/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/CommunityConfigurationFactoryTest.java deleted file mode 100644 index 3aae9ebee..000000000 --- a/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/CommunityConfigurationFactoryTest.java +++ /dev/null @@ -1,166 +0,0 @@ - -package eu.dnetlib.dhp; - -import java.io.IOException; -import java.lang.reflect.InvocationTargetException; -import java.util.*; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.dom4j.DocumentException; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -import com.google.gson.Gson; - -import eu.dnetlib.dhp.community.CommunityConfiguration; -import eu.dnetlib.dhp.community.CommunityConfigurationFactory; -import eu.dnetlib.dhp.community.Constraint; -import eu.dnetlib.dhp.community.SelectionConstraints; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; - -/** Created by miriam on 03/08/2018. */ -public class CommunityConfigurationFactoryTest { - - private final VerbResolver resolver = new VerbResolver(); - - @Test - public void parseTest() throws DocumentException, IOException { - String xml = IOUtils - .toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/community_configuration.xml")); - final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); - Assertions.assertEquals(5, cc.size()); - cc - .getCommunityList() - .forEach(c -> Assertions.assertTrue(StringUtils.isNoneBlank(c.getId()))); - } - - @Test - public void applyVerb() - throws InvocationTargetException, IllegalAccessException, NoSuchMethodException, - InstantiationException { - Constraint sc = new Constraint(); - sc.setVerb("not_contains"); - sc.setField("contributor"); - sc.setValue("DARIAH"); - sc.setSelection(resolver.getSelectionCriteria(sc.getVerb(), sc.getValue())); - String metadata = "This work has been partially supported by DARIAH-EU infrastructure"; - Assertions.assertFalse(sc.verifyCriteria(metadata)); - } - - @Test - public void loadSelCriteriaTest() throws DocumentException, IOException { - String xml = IOUtils - .toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.xml")); - final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); - Map> param = new HashMap<>(); - param.put("author", new ArrayList<>(Collections.singletonList("Pippo Pippi"))); - param - .put( - "description", - new ArrayList<>( - Collections - .singletonList( - "This work has been partially supported by DARIAH-EU infrastructure"))); - param - .put( - "contributor", - new ArrayList<>( - Collections - .singletonList( - "Pallino ha aiutato a scrivere il paper. Pallino lavora per DARIAH"))); - List comm = cc - .getCommunityForDatasource( - "openaire____::1cfdb2e14977f31a98e0118283401f32", param); - Assertions.assertEquals(1, comm.size()); - Assertions.assertEquals("dariah", comm.get(0)); - } - - @Test - public void test4() throws DocumentException, IOException { - final CommunityConfiguration cc = CommunityConfigurationFactory - .fromJson( - IOUtils - .toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.json"))); - cc.toString(); - } - - @Test - public void test5() throws IOException, DocumentException { - - // final CommunityConfiguration cc = - // CommunityConfigurationFactory.newInstance(IOUtils.toString(getClass().getResourceAsStream("test.xml"))); - final CommunityConfiguration cc = CommunityConfigurationFactory - .fromJson( - IOUtils - .toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/community_configuration.json"))); - - System.out.println(cc.toJson()); - } - - @Test - public void test6() { - String json = "{\"criteria\":[{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}]}"; - - String step1 = "{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}"; - - Constraint c = new Gson().fromJson(step1, Constraint.class); - // - // String step2 = - // "{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}"; - // - // ConstraintEncapsulator ce = new - // Gson().fromJson(step2,ConstraintEncapsulator.class); - // - // - // String step3 = - // "{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}"; - // - // Constraints cons = new Gson().fromJson(step3,Constraints.class); - // - // String step4 = - // "{\"criteria\":[{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}]}"; - // - // ConstraintsList cl = new Gson().fromJson(step4,ConstraintsList.class); - // - // String step5 = - // "{\"cl\":{\"criteria\":[{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}]}}"; - SelectionConstraints sl = new Gson().fromJson(json, SelectionConstraints.class); - } - - @Test - public void test7() throws IOException { - final CommunityConfiguration cc = CommunityConfigurationFactory - .fromJson( - IOUtils - .toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.json"))); - - System.out.println(cc.toJson()); - } - - @Test - public void temporaneo() throws Exception { - String xml = IOUtils - .toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml")); - final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); - System.out.println(cc.toJson()); - } -} diff --git a/dhp-workflows/dhp-bulktag/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml similarity index 88% rename from dhp-workflows/dhp-bulktag/pom.xml rename to dhp-workflows/dhp-enrichment/pom.xml index 98922c193..fe9833e3e 100644 --- a/dhp-workflows/dhp-bulktag/pom.xml +++ b/dhp-workflows/dhp-enrichment/pom.xml @@ -9,7 +9,7 @@ 4.0.0 - dhp-bulktag + dhp-enrichment @@ -31,6 +31,12 @@ dhp-schemas ${project.version} + + org.apache.spark + spark-hive_2.11 + test + + dom4j dom4j @@ -43,12 +49,16 @@ com.jayway.jsonpath json-path + io.github.classgraph classgraph 4.8.71 + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java similarity index 80% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/PropagationConstant.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 8d2fede82..c8eb017c7 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -15,6 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; public class PropagationConstant { @@ -24,10 +26,6 @@ public class PropagationConstant { public static final String TRUE = "true"; - public static final String DNET_COUNTRY_SCHEMA = "dnet:countries"; - public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions"; - public static final String DNET_SCHEMA_ID = "dnet:provenanceActions"; - public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos"; public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories"; @@ -46,22 +44,6 @@ public class PropagationConstant { public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result"; public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations"; - public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "isProvidedBy"; - - public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization"; - public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation"; - public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf"; - public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution"; - - public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult"; - - public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject"; - public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome"; - public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy"; - public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces"; - - public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges"; - public static final String PROPAGATION_AUTHOR_PID = "ORCID"; public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -76,8 +58,8 @@ public class PropagationConstant { Country nc = new Country(); nc.setClassid(classid); nc.setClassname(classname); - nc.setSchemename(DNET_COUNTRY_SCHEMA); - nc.setSchemeid(DNET_COUNTRY_SCHEMA); + nc.setSchemename(ModelConstants.DNET_COUNTRY_TYPE); + nc.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE); nc .setDataInfo( getDataInfo( @@ -102,8 +84,8 @@ public class PropagationConstant { Qualifier pa = new Qualifier(); pa.setClassid(inference_class_id); pa.setClassname(inference_class_name); - pa.setSchemeid(DNET_SCHEMA_ID); - pa.setSchemename(DNET_SCHEMA_NAME); + pa.setSchemeid(ModelConstants.DNET_PID_TYPES); + pa.setSchemename(ModelConstants.DNET_PID_TYPES); return pa; } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java similarity index 94% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java index e62b4b4fc..1c65e8ade 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.bulktag; +import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.Optional; @@ -19,8 +20,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.gson.Gson; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.community.*; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.bulktag.community.*; +import eu.dnetlib.dhp.schema.oaf.Result; public class SparkBulkTagJob { @@ -84,6 +85,7 @@ public class SparkBulkTagJob { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc); }); } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Community.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java similarity index 80% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Community.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java index a73ff4d3e..0f45d3beb 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Community.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Community.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; import java.util.ArrayList; @@ -17,7 +17,7 @@ public class Community implements Serializable { private String id; private List subjects = new ArrayList<>(); - private List datasources = new ArrayList<>(); + private List providers = new ArrayList<>(); private List zenodoCommunities = new ArrayList<>(); public String toJson() { @@ -27,7 +27,7 @@ public class Community implements Serializable { public boolean isValid() { return !getSubjects().isEmpty() - || !getDatasources().isEmpty() + || !getProviders().isEmpty() || !getZenodoCommunities().isEmpty(); } @@ -47,12 +47,12 @@ public class Community implements Serializable { this.subjects = subjects; } - public List getDatasources() { - return datasources; + public List getProviders() { + return providers; } - public void setDatasources(List datasources) { - this.datasources = datasources; + public void setProviders(List providers) { + this.providers = providers; } public List getZenodoCommunities() { diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfiguration.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java similarity index 96% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfiguration.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java index c5bbb66eb..29ddde15f 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfiguration.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfiguration.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; import java.util.ArrayList; @@ -16,8 +16,8 @@ import com.google.common.collect.Maps; import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter; -import eu.dnetlib.dhp.selectioncriteria.Selection; +import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter; +import eu.dnetlib.dhp.bulktag.criteria.Selection; /** Created by miriam on 02/08/2018. */ public class CommunityConfiguration implements Serializable { @@ -84,7 +84,7 @@ public class CommunityConfiguration implements Serializable { add(sbj.toLowerCase().trim(), p, subjectMap); } // get datasources - for (Datasource d : c.getDatasources()) { + for (Provider d : c.getProviders()) { add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap); } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfigurationFactory.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java similarity index 86% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfigurationFactory.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java index 508f0663d..607315f3f 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfigurationFactory.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFactory.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.io.StringReader; import java.util.ArrayList; @@ -19,10 +19,10 @@ import com.google.common.collect.Maps; import com.google.gson.Gson; import com.google.gson.GsonBuilder; -import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter; -import eu.dnetlib.dhp.selectioncriteria.Selection; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; -import eu.dnetlib.dhp.selectioncriteria.VerbResolverFactory; +import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter; +import eu.dnetlib.dhp.bulktag.criteria.Selection; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory; /** Created by miriam on 03/08/2018. */ public class CommunityConfigurationFactory { @@ -77,7 +77,7 @@ public class CommunityConfigurationFactory { log.info(String.format("community id: %s", c.getId())); c.setSubjects(parseSubjects(node)); - c.setDatasources(parseDatasources(node)); + c.setProviders(parseDatasources(node)); c.setZenodoCommunities(parseZenodoCommunities(node)); return c; } @@ -96,17 +96,17 @@ public class CommunityConfigurationFactory { return subjects; } - private static List parseDatasources(final Node node) { + private static List parseDatasources(final Node node) { final List list = node.selectNodes("./datasources/datasource"); - final List datasourceList = new ArrayList<>(); + final List providerList = new ArrayList<>(); for (Node n : list) { - Datasource d = new Datasource(); + Provider d = new Provider(); d.setOpenaireId(n.selectSingleNode("./openaireId").getText()); d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver); - datasourceList.add(d); + providerList.add(d); } - log.info("size of the datasource list " + datasourceList.size()); - return datasourceList; + log.info("size of the datasource list " + providerList.size()); + return providerList; } private static List parseZenodoCommunities(final Node node) { diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraint.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java similarity index 86% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraint.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java index 54f381d4a..e0856ae8f 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraint.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java @@ -1,11 +1,11 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; -import eu.dnetlib.dhp.selectioncriteria.Selection; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; +import eu.dnetlib.dhp.bulktag.criteria.Selection; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; public class Constraint implements Serializable { private String verb; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraints.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java similarity index 94% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraints.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java index af095c513..b56dfaaa3 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraints.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraints.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; @@ -14,7 +14,7 @@ import org.apache.commons.logging.LogFactory; import com.google.gson.Gson; import com.google.gson.reflect.TypeToken; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; /** Created by miriam on 02/08/2018. */ public class Constraints implements Serializable { diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Pair.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java similarity index 92% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Pair.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java index 01cd3ce22..50e1836fa 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Pair.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Pair.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ProtoMap.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java similarity index 80% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ProtoMap.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java index d48dce2c6..fd7481719 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ProtoMap.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ProtoMap.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; import java.util.HashMap; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Datasource.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java similarity index 86% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Datasource.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java index a3d343087..b9c37f4dc 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Datasource.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Provider.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; @@ -9,11 +9,11 @@ import org.dom4j.Node; import com.google.gson.Gson; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; /** Created by miriam on 01/08/2018. */ -public class Datasource implements Serializable { - private static final Log log = LogFactory.getLog(Datasource.class); +public class Provider implements Serializable { + private static final Log log = LogFactory.getLog(Provider.class); private String openaireId; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/QueryInformationSystem.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java similarity index 98% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/QueryInformationSystem.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java index 2c18392c7..7ec2f916f 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/QueryInformationSystem.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/QueryInformationSystem.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.util.List; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java similarity index 96% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java index eb531c6b1..f5a985d15 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java @@ -1,7 +1,8 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; -import static eu.dnetlib.dhp.community.TagginConstants.*; +import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*; +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import java.io.Serializable; import java.util.*; @@ -239,8 +240,8 @@ public class ResultTagger implements Serializable { Qualifier pa = new Qualifier(); pa.setClassid(inference_class_id); pa.setClassname(inference_class_name); - pa.setSchemeid(DNET_SCHEMA_ID); - pa.setSchemename(DNET_SCHEMA_NAME); + pa.setSchemeid(DNET_PROVENANCE_ACTIONS); + pa.setSchemename(DNET_PROVENANCE_ACTIONS); return pa; } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/SelectionConstraints.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java similarity index 91% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/SelectionConstraints.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java index 802e2f5d6..71ff61d1b 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/SelectionConstraints.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; import java.lang.reflect.Type; @@ -10,7 +10,7 @@ import java.util.Map; import com.google.gson.Gson; import com.google.gson.reflect.TypeToken; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; public class SelectionConstraints implements Serializable { private List criteria; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/TagginConstants.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java similarity index 66% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/TagginConstants.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java index 92d37d089..3cdc7c941 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/TagginConstants.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/TaggingConstants.java @@ -1,20 +1,14 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; -public class TagginConstants { +public class TaggingConstants { public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging"; - public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions"; - public static final String DNET_SCHEMA_ID = "dnet:provenanceActions"; - public static final String CLASS_ID_SUBJECT = "community:subject"; public static final String CLASS_ID_DATASOURCE = "community:datasource"; public static final String CLASS_ID_CZENODO = "community:zenodocommunity"; - public static final String SCHEMA_ID = "dnet:provenanceActions"; - public static final String COUNTER_GROUP = "Bulk Tagging"; - public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/"; public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject"; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ZenodoCommunity.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java similarity index 95% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ZenodoCommunity.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java index e1492f6a5..bc6b75fba 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ZenodoCommunity.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ZenodoCommunity.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.community; +package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java similarity index 91% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerb.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java index a6ef2d908..496630fa3 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerb.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerb.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java similarity index 92% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerbIgnoreCase.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java index b8b0262e9..a4a6f5663 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/ContainsVerbIgnoreCase.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java similarity index 91% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerb.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java index 3f17a6bb3..b9088d012 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerb.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerb.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java similarity index 91% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerbIgnoreCase.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java index 934406859..c5f0ce070 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/EqualVerbIgnoreCase.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/InterfaceAdapter.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java similarity index 96% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/InterfaceAdapter.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java index 9ef3bd60c..e9b948b2b 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/InterfaceAdapter.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/InterfaceAdapter.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.lang.reflect.Type; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java similarity index 91% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerb.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java index eb83b256e..03ec9804b 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerb.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerb.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java similarity index 92% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerbIgnoreCase.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java index fab3efef3..b21be83f0 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotContainsVerbIgnoreCase.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerb.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java similarity index 91% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerb.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java index 2311c2987..86bf00012 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerb.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerb.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerbIgnoreCase.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java similarity index 92% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerbIgnoreCase.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java index de2f682a5..c6958a641 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerbIgnoreCase.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/NotEqualVerbIgnoreCase.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/Selection.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java similarity index 60% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/Selection.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java index b488bda01..ec9fb716d 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/Selection.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/Selection.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; public interface Selection { diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbClass.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java similarity index 86% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbClass.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java index d467f934f..5b35919bd 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbClass.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbClass.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolver.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java similarity index 90% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolver.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java index 6a8ceebc3..3d0db2063 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolver.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolver.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; @@ -22,12 +22,12 @@ public class VerbResolver implements Serializable { .verbose() // If you want to enable logging to stderr .enableAllInfo() // Scan classes, methods, fields, annotations .whitelistPackages( - "eu.dnetlib.dhp.selectioncriteria") // Scan com.xyz and subpackages + "eu.dnetlib.dhp.bulktag.criteria") // Scan com.xyz and subpackages .scan()) { // Perform the scan and return a ScanResult ClassInfoList routeClassInfoList = scanResult .getClassesWithAnnotation( - "eu.dnetlib.dhp.selectioncriteria.VerbClass"); + "eu.dnetlib.dhp.bulktag.criteria.VerbClass"); this.map = routeClassInfoList .stream() diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolverFactory.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java similarity index 73% rename from dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolverFactory.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java index 58bf60d42..0bb801999 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolverFactory.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/criteria/VerbResolverFactory.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.selectioncriteria; +package eu.dnetlib.dhp.bulktag.criteria; public class VerbResolverFactory { diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java similarity index 97% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java index e91a1e48a..98b573102 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java @@ -21,6 +21,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; /** @@ -100,7 +101,7 @@ public class PrepareDatasourceCountryAssociation { + "JOIN ( SELECT source, target " + " FROM relation " + " WHERE relclass = '" - + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS + + ModelConstants.IS_PROVIDED_BY + "' " + " AND datainfo.deletedbyinference = false ) rel " + "ON d.id = rel.source " diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java similarity index 95% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java index 9dc17701b..974b3a3b1 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java @@ -69,13 +69,16 @@ public class SparkCountryPropagationJob { runWithSparkSession( conf, isSparkSessionManaged, - spark -> execPropagation( - spark, - sourcePath, - preparedInfoPath, - outputPath, - resultClazz, - saveGraph)); + spark -> { + removeOutputDir(spark, outputPath); + execPropagation( + spark, + sourcePath, + preparedInfoPath, + outputPath, + resultClazz, + saveGraph); + }); } private static void execPropagation( diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java similarity index 81% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java index 3e16b4b4b..b15f813ac 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -74,9 +74,7 @@ public class PrepareResultOrcidAssociationStep1 { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); prepareInfo( spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel); }); @@ -97,22 +95,22 @@ public class PrepareResultOrcidAssociationStep1 { Dataset result = readPath(spark, inputResultPath, resultClazz); result.createOrReplaceTempView("result"); - String query = " select target resultId, author authorList" - + " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author " - + " from ( " - + " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " - + " from result " - + " lateral view explode (author) a as MyT " - + " lateral view explode (MyT.pid) p as MyP " - + " where MyP.qualifier.classid = 'ORCID') tmp " - + " group by id) r_t " - + " join (" - + " select source, target " - + " from relation " - + " where datainfo.deletedbyinference = false " + String query = "SELECT target resultId, author authorList" + + " FROM (SELECT id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author " + + " FROM ( " + + " SELECT DISTINCT id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " + + " FROM result " + + " LATERAL VIEW EXPLODE (author) a AS MyT " + + " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP " + + " WHERE MyP.qualifier.classid = 'ORCID') tmp " + + " GROUP BY id) r_t " + + " JOIN (" + + " SELECT source, target " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + getConstraintList(" relclass = '", allowedsemrel) - + ") rel_rel " - + " on source = id"; + + " ) rel_rel " + + " ON source = id"; spark .sql(query) .as(Encoders.bean(ResultOrcidList.class)) diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java similarity index 97% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java index 65d8811bc..2cea32e58 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java @@ -50,9 +50,7 @@ public class PrepareResultOrcidAssociationStep2 { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); mergeInfo(spark, inputPath, outputPath); }); } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java similarity index 91% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java index ebb75a5a6..bea847ca7 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -70,11 +70,10 @@ public class SparkOrcidToResultFromSemRelJob { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) + removeOutputDir(spark, outputPath); + if (saveGraph) { execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz); + } }); } @@ -132,16 +131,16 @@ public class SparkOrcidToResultFromSemRelJob { private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) { boolean toaddpid = false; - if (StringUtils.isNoneEmpty(autoritative_author.getSurname())) { - if (StringUtils.isNoneEmpty(author.getSurname())) { + if (StringUtils.isNotEmpty(autoritative_author.getSurname())) { + if (StringUtils.isNotEmpty(author.getSurname())) { if (autoritative_author .getSurname() .trim() .equalsIgnoreCase(author.getSurname().trim())) { // have the same surname. Check the name - if (StringUtils.isNoneEmpty(autoritative_author.getName())) { - if (StringUtils.isNoneEmpty(author.getName())) { + if (StringUtils.isNotEmpty(autoritative_author.getName())) { + if (StringUtils.isNotEmpty(author.getName())) { if (autoritative_author .getName() .trim() @@ -150,12 +149,14 @@ public class SparkOrcidToResultFromSemRelJob { } // they could be differently written (i.e. only the initials of the name // in one of the two - if (autoritative_author - .getName() - .trim() - .substring(0, 0) - .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { - toaddpid = true; + else { + if (autoritative_author + .getName() + .trim() + .substring(0, 0) + .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { + toaddpid = true; + } } } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java similarity index 95% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java index 05dcdc692..4cd7f88df 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java @@ -21,6 +21,7 @@ import com.google.gson.Gson; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareProjectResultsAssociation { @@ -60,6 +61,8 @@ public class PrepareProjectResultsAssociation { conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, potentialUpdatePath); + removeOutputDir(spark, alreadyLinkedPath); prepareResultProjProjectResults( spark, inputPath, @@ -83,7 +86,7 @@ public class PrepareProjectResultsAssociation { + " FROM relation " + " WHERE datainfo.deletedbyinference = false " + " AND relClass = '" - + RELATION_RESULT_PROJECT_REL_CLASS + + ModelConstants.IS_PRODUCED_BY + "'"; Dataset resproj_relation = spark.sql(resproj_relation_query); diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java similarity index 94% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java index 36694b3dd..1f6264c18 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java @@ -20,6 +20,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; @@ -122,9 +123,9 @@ public class SparkResultToProjectThroughSemRelJob { getRelation( resId, projectId, - RELATION_RESULT_PROJECT_REL_CLASS, - RELATION_RESULTPROJECT_REL_TYPE, - RELATION_RESULTPROJECT_SUBREL_TYPE, + ModelConstants.IS_PRODUCED_BY, + ModelConstants.RESULT_PROJECT, + ModelConstants.OUTCOME, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); @@ -133,9 +134,9 @@ public class SparkResultToProjectThroughSemRelJob { getRelation( projectId, resId, - RELATION_PROJECT_RESULT_REL_CLASS, - RELATION_RESULTPROJECT_REL_TYPE, - RELATION_RESULTPROJECT_SUBREL_TYPE, + ModelConstants.PRODUCES, + ModelConstants.RESULT_PROJECT, + ModelConstants.OUTCOME, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java similarity index 96% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index e2d4d5687..5574aad75 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.gson.Gson; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; public class PrepareResultCommunitySet { @@ -55,9 +56,7 @@ public class PrepareResultCommunitySet { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } + removeOutputDir(spark, outputPath); prepareInfo(spark, inputPath, outputPath, organizationMap); }); } @@ -76,13 +75,13 @@ public class PrepareResultCommunitySet { + " FROM relation " + " WHERE datainfo.deletedbyinference = false " + " AND relClass = '" - + RELATION_RESULT_ORGANIZATION_REL_CLASS + + ModelConstants.HAS_AUTHOR_INSTITUTION + "') result_organization " + "LEFT JOIN (SELECT source, collect_set(target) org_set " + " FROM relation " + " WHERE datainfo.deletedbyinference = false " + " AND relClass = '" - + RELATION_REPRESENTATIVERESULT_RESULT_CLASS + + ModelConstants.MERGES + "' " + " GROUP BY source) organization_organization " + "ON result_organization.target = organization_organization.source "; diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java similarity index 98% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java index 71275cc7f..66297e177 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java @@ -68,11 +68,10 @@ public class SparkResultToCommunityFromOrganizationJob { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) + removeOutputDir(spark, outputPath); + if (saveGraph) { execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath); + } }); } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java similarity index 94% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index f8fe1668f..84e40fa88 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -17,6 +17,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Relation; @@ -58,30 +59,15 @@ public class PrepareResultInstRepoAssociation { isSparkSessionManaged, spark -> { readNeededResources(spark, inputPath); + + removeOutputDir(spark, datasourceOrganizationPath); prepareDatasourceOrganization(spark, datasourceOrganizationPath); + + removeOutputDir(spark, alreadyLinkedPath); prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath); }); } - private static void prepareAlreadyLinkedAssociation( - SparkSession spark, String alreadyLinkedPath) { - String query = "Select source resultId, collect_set(target) organizationSet " - + "from relation " - + "where datainfo.deletedbyinference = false " - + "and relClass = '" - + RELATION_RESULT_ORGANIZATION_REL_CLASS - + "' " - + "group by source"; - - spark - .sql(query) - .as(Encoders.bean(ResultOrganizationSet.class)) - // TODO retry to stick with datasets - .toJavaRDD() - .map(r -> OBJECT_MAPPER.writeValueAsString(r)) - .saveAsTextFile(alreadyLinkedPath, GzipCodec.class); - } - private static void readNeededResources(SparkSession spark, String inputPath) { Dataset datasource = readPath(spark, inputPath + "/datasource", Datasource.class); datasource.createOrReplaceTempView("datasource"); @@ -106,7 +92,7 @@ public class PrepareResultInstRepoAssociation { + "JOIN ( SELECT source, target " + "FROM relation " + "WHERE relclass = '" - + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS + + ModelConstants.IS_PROVIDED_BY + "' " + "AND datainfo.deletedbyinference = false ) rel " + "ON d.id = rel.source "; @@ -119,4 +105,24 @@ public class PrepareResultInstRepoAssociation { .option("compression", "gzip") .json(datasourceOrganizationPath); } + + private static void prepareAlreadyLinkedAssociation( + SparkSession spark, String alreadyLinkedPath) { + String query = "Select source resultId, collect_set(target) organizationSet " + + "from relation " + + "where datainfo.deletedbyinference = false " + + "and relClass = '" + + ModelConstants.HAS_AUTHOR_INSTITUTION + + "' " + + "group by source"; + + spark + .sql(query) + .as(Encoders.bean(ResultOrganizationSet.class)) + // TODO retry to stick with datasets + .toJavaRDD() + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(alreadyLinkedPath, GzipCodec.class); + } + } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java similarity index 93% rename from dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java index 86634d43f..0ce741b87 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; @@ -83,10 +84,8 @@ public class SparkResultToOrganizationFromIstRepoJob { conf, isSparkSessionManaged, spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) + removeOutputDir(spark, outputPath); + if (saveGraph) { execPropagation( spark, datasourceorganization, @@ -94,6 +93,7 @@ public class SparkResultToOrganizationFromIstRepoJob { inputPath, outputPath, resultClazz); + } }); } @@ -151,9 +151,9 @@ public class SparkResultToOrganizationFromIstRepoJob { getRelation( orgId, resultId, - RELATION_ORGANIZATION_RESULT_REL_CLASS, - RELATION_RESULTORGANIZATION_REL_TYPE, - RELATION_RESULTORGANIZATION_SUBREL_TYPE, + ModelConstants.IS_AUTHOR_INSTITUTION_OF, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); @@ -162,9 +162,9 @@ public class SparkResultToOrganizationFromIstRepoJob { getRelation( resultId, orgId, - RELATION_RESULT_ORGANIZATION_REL_CLASS, - RELATION_RESULTORGANIZATION_REL_TYPE, - RELATION_RESULTORGANIZATION_SUBREL_TYPE, + ModelConstants.HAS_AUTHOR_INSTITUTION, + ModelConstants.RESULT_ORGANIZATION, + ModelConstants.AFFILIATION, PROPAGATION_DATA_INFO_TYPE, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); diff --git a/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json similarity index 100% rename from dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json diff --git a/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml similarity index 88% rename from dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml index 524281bc9..f019f8413 100644 --- a/dhp-workflows/dhp-bulktag/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml @@ -18,6 +18,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -42,8 +53,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -53,8 +62,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -64,8 +71,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -75,8 +80,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -95,13 +98,11 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-publication eu.dnetlib.dhp.bulktag.SparkBulkTagJob - dhp-bulktag-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --num-executors=${sparkExecutorNumber} --executor-memory=${sparkExecutorMemory} @@ -124,13 +125,11 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-dataset eu.dnetlib.dhp.bulktag.SparkBulkTagJob - dhp-bulktag-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --num-executors=${sparkExecutorNumber} --executor-memory=${sparkExecutorMemory} @@ -153,13 +152,11 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-orp eu.dnetlib.dhp.bulktag.SparkBulkTagJob - dhp-bulktag-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --num-executors=${sparkExecutorNumber} --executor-memory=${sparkExecutorMemory} @@ -182,13 +179,11 @@ - ${jobTracker} - ${nameNode} yarn-cluster cluster bulkTagging-software eu.dnetlib.dhp.bulktag.SparkBulkTagJob - dhp-bulktag-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --num-executors=${sparkExecutorNumber} --executor-memory=${sparkExecutorMemory} diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml similarity index 94% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml index f269c5442..85116e4cc 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml @@ -19,6 +19,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -43,8 +54,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -54,18 +63,15 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization + - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -75,8 +81,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -92,7 +96,7 @@ cluster PrepareDatasourceCountryAssociation eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -126,7 +130,7 @@ cluster prepareResultCountry-Publication eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -156,7 +160,7 @@ cluster prepareResultCountry-Dataset eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -186,7 +190,7 @@ cluster prepareResultCountry-ORP eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -216,7 +220,7 @@ cluster prepareResultCountry-Software eu.dnetlib.dhp.countrypropagation.PrepareResultCountrySet - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -255,7 +259,7 @@ cluster countryPropagationForPublications eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -285,7 +289,7 @@ cluster countryPropagationForDataset eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -315,7 +319,7 @@ cluster countryPropagationForORP eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -345,7 +349,7 @@ cluster countryPropagationForSoftware eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml similarity index 74% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml index 7b06b6504..5ddc5fedf 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml @@ -57,6 +57,7 @@ + ${jobTracker} @@ -81,7 +82,6 @@ - @@ -95,7 +95,7 @@ cluster ORCIDPropagation-PreparePhase1-Publications eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1 - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -111,16 +111,11 @@ --conf spark.hadoop.mapreduce.map.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false - --sourcePath - ${sourcePath} - --hive_metastore_uris - ${hive_metastore_uris} - --resultTableName - eu.dnetlib.dhp.schema.oaf.Publication - --outputPath - ${workingDir}/preparedInfo/targetOrcidAssoc - --allowedsemrels - ${allowedsemrels} + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir}/preparedInfo/targetOrcidAssoc + --allowedsemrels${allowedsemrels} @@ -132,7 +127,7 @@ cluster ORCIDPropagation-PreparePhase1-Dataset eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1 - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -144,16 +139,11 @@ --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --sourcePath - ${sourcePath} - --hive_metastore_uris - ${hive_metastore_uris} - --resultTableName - eu.dnetlib.dhp.schema.oaf.Dataset - --outputPath - ${workingDir}/preparedInfo/targetOrcidAssoc - --allowedsemrels - ${allowedsemrels} + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir}/preparedInfo/targetOrcidAssoc + --allowedsemrels${allowedsemrels} @@ -165,7 +155,7 @@ cluster ORCIDPropagation-PreparePhase1-ORP eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1 - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -177,16 +167,11 @@ --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --sourcePath - ${sourcePath} - --hive_metastore_uris - ${hive_metastore_uris} - --resultTableName - eu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath - ${workingDir}/preparedInfo/targetOrcidAssoc - --allowedsemrels - ${allowedsemrels} + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${workingDir}/preparedInfo/targetOrcidAssoc + --allowedsemrels${allowedsemrels} @@ -198,7 +183,7 @@ cluster ORCIDPropagation-PreparePhase1-Software eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep1 - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -210,16 +195,11 @@ --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --sourcePath - ${sourcePath} - --hive_metastore_uris - ${hive_metastore_uris} - --resultTableName - eu.dnetlib.dhp.schema.oaf.Software - --outputPath - ${workingDir}/preparedInfo/targetOrcidAssoc - --allowedsemrels - ${allowedsemrels} + --sourcePath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/preparedInfo/targetOrcidAssoc + --allowedsemrels${allowedsemrels} @@ -233,7 +213,7 @@ cluster ORCIDPropagation-PreparePhase2 eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep2 - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -245,16 +225,13 @@ --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --sourcePath - ${workingDir}/preparedInfo/targetOrcidAssoc - --outputPath - ${workingDir}/preparedInfo/mergedOrcidAssoc + --sourcePath${workingDir}/preparedInfo/targetOrcidAssoc + --outputPath${workingDir}/preparedInfo/mergedOrcidAssoc - - + @@ -268,7 +245,7 @@ cluster ORCIDPropagation-Publication eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -284,29 +261,24 @@ --conf spark.hadoop.mapreduce.reduce.speculative=false --conf spark.sql.shuffle.partitions=3840 - --possibleUpdatesPath - ${workingDir}/preparedInfo/mergedOrcidAssoc - --sourcePath - ${sourcePath}/publication - --hive_metastore_uris - ${hive_metastore_uris} - --resultTableName - eu.dnetlib.dhp.schema.oaf.Publication - --outputPath - ${outputPath}/publication - --saveGraph - ${saveGraph} + --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --sourcePath${sourcePath}/publication + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${outputPath}/publication + --saveGraph${saveGraph} + yarn cluster ORCIDPropagation-Dataset eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -321,29 +293,24 @@ --conf spark.hadoop.mapreduce.map.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false - --possibleUpdatesPath - ${workingDir}/preparedInfo/mergedOrcidAssoc - --sourcePath - ${sourcePath}/dataset - --hive_metastore_uris - ${hive_metastore_uris} - --resultTableName - eu.dnetlib.dhp.schema.oaf.Dataset - --outputPath - ${outputPath}/dataset - --saveGraph - ${saveGraph} + --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --sourcePath${sourcePath}/dataset + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${outputPath}/dataset + --saveGraph${saveGraph} + yarn cluster ORCIDPropagation-ORP eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -358,29 +325,24 @@ --conf spark.hadoop.mapreduce.map.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false - --possibleUpdatesPath - ${workingDir}/preparedInfo/mergedOrcidAssoc - --sourcePath - ${sourcePath}/otherresearchproduct - --hive_metastore_uris - ${hive_metastore_uris} - --resultTableName - eu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath - ${outputPath}/otherresearchproduct - --saveGraph - ${saveGraph} + --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --sourcePath${sourcePath}/otherresearchproduct + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${outputPath}/otherresearchproduct + --saveGraph${saveGraph} + yarn cluster ORCIDPropagation-Software eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -395,22 +357,19 @@ --conf spark.hadoop.mapreduce.map.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false - --possibleUpdatesPath - ${workingDir}/preparedInfo/mergedOrcidAssoc - --sourcePath - ${sourcePath}/software - --hive_metastore_uris - ${hive_metastore_uris} - --resultTableName - eu.dnetlib.dhp.schema.oaf.Software - --outputPath - ${outputPath}/software - --saveGraph - ${saveGraph} + --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --sourcePath${sourcePath}/software + --hive_metastore_uris${hive_metastore_uris} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${outputPath}/software + --saveGraph${saveGraph} + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml new file mode 100644 index 000000000..9e91c06fb --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml @@ -0,0 +1,184 @@ + + + + sourcePath + the source path + + + allowedsemrels + the allowed semantics + + + outputPath + the output path + + + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + + + + + ${nameNode}/${sourcePath}/relation + ${nameNode}/${outputPath}/relation + + + + + + + + ${nameNode}/${sourcePath}/publication + ${nameNode}/${outputPath}/publication + + + + + + + + ${nameNode}/${sourcePath}/dataset + ${nameNode}/${outputPath}/dataset + + + + + + + + ${nameNode}/${sourcePath}/otherresearchproduct + ${nameNode}/${outputPath}/otherresearchproduct + + + + + + + + ${nameNode}/${sourcePath}/software + ${nameNode}/${outputPath}/software + + + + + + + + ${nameNode}/${sourcePath}/organization + ${nameNode}/${outputPath}/organization + + + + + + + + ${nameNode}/${sourcePath}/project + ${nameNode}/${outputPath}/project + + + + + + + + ${nameNode}/${sourcePath}/datasource + ${nameNode}/${outputPath}/datasource + + + + + + + + + + yarn + cluster + PrepareProjectResultsAssociation + eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath}/relation + --allowedsemrels${allowedsemrels} + --hive_metastore_uris${hive_metastore_uris} + --potentialUpdatePath${workingDir}/preparedInfo/potentialUpdates + --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + + + + + + + + yarn + cluster + ProjectToResultPropagation + eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --saveGraph${saveGraph} + --hive_metastore_uris${hive_metastore_uris} + --outputPath${outputPath}/relation + --potentialUpdatePath${workingDir}/preparedInfo/potentialUpdates + --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml similarity index 93% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml index 3be69bde6..6a329fdc4 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml @@ -14,6 +14,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -38,8 +49,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -49,8 +58,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -60,8 +67,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -71,8 +76,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -88,7 +91,7 @@ cluster Prepare-Community-Result-Organization eu.dnetlib.dhp.resulttocommunityfromorganization.PrepareResultCommunitySet - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -101,8 +104,8 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/relation - --hive_metastore_uris${hive_metastore_uris} --outputPath${workingDir}/preparedInfo/resultCommunityList + --hive_metastore_uris${hive_metastore_uris} --organizationtoresultcommunitymap${organizationtoresultcommunitymap} @@ -122,7 +125,7 @@ cluster community2resultfromorganization-Publication eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -136,9 +139,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/publication + --outputPath${outputPath}/publication --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${outputPath}/publication --saveGraph${saveGraph} @@ -151,7 +154,7 @@ cluster community2resultfromorganization-Dataset eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -165,9 +168,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/dataset + --outputPath${outputPath}/dataset --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${outputPath}/dataset --saveGraph${saveGraph} @@ -180,7 +183,7 @@ cluster community2resultfromorganization-ORP eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -194,9 +197,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/otherresearchproduct + --outputPath${outputPath}/otherresearchproduct --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${outputPath}/otherresearchproduct --saveGraph${saveGraph} @@ -209,7 +212,7 @@ cluster community2resultfromorganization-Software eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -223,9 +226,9 @@ --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList --sourcePath${sourcePath}/software + --outputPath${outputPath}/software --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${outputPath}/software --saveGraph${saveGraph} diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml similarity index 96% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml index b75b2d31e..81b51443c 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml @@ -99,7 +99,7 @@ cluster ResultToCommunitySemRel-PreparePhase1-Publications eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -128,7 +128,7 @@ cluster ResultToCommunitySemRel-PreparePhase1-Dataset eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -150,13 +150,14 @@ + yarn cluster ResultToCommunitySemRel-PreparePhase1-ORP eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -185,7 +186,7 @@ cluster ResultToCommunitySemRel-PreparePhase1-Software eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -216,7 +217,7 @@ cluster ResultToCommunityEmRelPropagation-PreparePhase2 eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep2 - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -232,9 +233,7 @@ --outputPath${workingDir}/preparedInfo/mergedCommunityAssoc - - @@ -250,7 +249,7 @@ cluster Result2CommunitySemRelPropagation-Publication eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -279,7 +278,7 @@ cluster Result2CommunitySemRelPropagation-Dataset eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -308,7 +307,7 @@ cluster Result2CommunitySemRelPropagation-ORP eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -337,7 +336,7 @@ cluster Result2CommunitySemRelPropagation-Software eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml similarity index 91% rename from dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml index 73268fcc7..e0563abae 100644 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml @@ -10,6 +10,17 @@ + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -38,8 +49,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/relation ${nameNode}/${outputPath}/relation @@ -49,8 +58,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/publication ${nameNode}/${outputPath}/publication @@ -60,8 +67,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/dataset ${nameNode}/${outputPath}/dataset @@ -71,8 +76,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/otherresearchproduct ${nameNode}/${outputPath}/otherresearchproduct @@ -82,8 +85,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/software ${nameNode}/${outputPath}/software @@ -93,8 +94,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/organization ${nameNode}/${outputPath}/organization @@ -104,8 +103,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/project ${nameNode}/${outputPath}/project @@ -115,8 +112,6 @@ - ${jobTracker} - ${nameNode} ${nameNode}/${sourcePath}/datasource ${nameNode}/${outputPath}/datasource @@ -125,13 +120,14 @@ + yarn cluster PrepareResultOrganizationAssociation eu.dnetlib.dhp.resulttoorganizationfrominstrepo.PrepareResultInstRepoAssociation - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -163,7 +159,7 @@ cluster resultToOrganizationFromInstRepoPropagationForPublications eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -176,12 +172,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/publication - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication @@ -193,7 +189,7 @@ cluster resultToOrganizationFromInstRepoPropagationForDataset eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -206,12 +202,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/dataset - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset @@ -223,7 +219,7 @@ cluster resultToOrganizationFromInstRepoPropagationForORP eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -236,12 +232,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/otherresearchproduct - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct @@ -253,7 +249,7 @@ cluster resultToOrganizationFromInstRepoPropagationForSoftware eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob - dhp-propagation-${projectVersion}.jar + dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} --executor-memory=${sparkExecutorMemory} @@ -266,12 +262,12 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/software - --hive_metastore_uris${hive_metastore_uris} - --saveGraph${saveGraph} - --resultTableNameeu.dnetlib.dhp.schema.oaf.Software --outputPath${outputPath}/relation --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --hive_metastore_uris${hive_metastore_uris} + --saveGraph${saveGraph} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software diff --git a/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/BulkTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java similarity index 71% rename from dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/BulkTagJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java index 75ecb0298..72e0a63fa 100644 --- a/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/BulkTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/BulkTagJobTest.java @@ -1,13 +1,14 @@ -package eu.dnetlib.dhp; +package eu.dnetlib.dhp.bulktag; -import static eu.dnetlib.dhp.community.TagginConstants.ZENODO_COMMUNITY_INDICATOR; +import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -18,37 +19,43 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import org.mortbay.util.IO; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.bulktag.SparkBulkTagJob; import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Software; public class BulkTagJobTest { + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = eu.dnetlib.dhp.BulkTagJobTest.class.getClassLoader(); + public static final String MOCK_IS_LOOK_UP_URL = "BASEURL:8280/is/services/isLookUp"; + + public static final String pathMap = "{ \"author\" : \"$['author'][*]['fullname']\"," + + " \"title\" : \"$['title'][*]['value']\"," + + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + + " \"contributor\" : \"$['contributor'][*]['value']\"," + + " \"description\" : \"$['description'][*]['value']\"}"; private static SparkSession spark; private static Path workingDir; - private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.BulkTagJobTest.class); + + private static final Logger log = LoggerFactory.getLogger(BulkTagJobTest.class); private static String taggingConf = ""; static { try { - taggingConf = IO + taggingConf = IOUtils .toString( BulkTagJobTest.class .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml")); + "/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml")); } catch (IOException e) { e.printStackTrace(); } @@ -56,11 +63,11 @@ public class BulkTagJobTest { @BeforeAll public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName()); + workingDir = Files.createTempDirectory(BulkTagJobTest.class.getSimpleName()); log.info("using work dir {}", workingDir); SparkConf conf = new SparkConf(); - conf.setAppName(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName()); + conf.setAppName(BulkTagJobTest.class.getSimpleName()); conf.setMaster("local[*]"); conf.set("spark.driver.host", "localhost"); @@ -84,34 +91,22 @@ public class BulkTagJobTest { @Test public void noUpdatesTest() throws Exception { + final String pathMap = BulkTagJobTest.pathMap; SparkBulkTagJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), "-sourcePath", - getClass().getResource("/eu/dnetlib/dhp/sample/dataset/no_updates").getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-isLookUpUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-pathMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" - // "-preparedInfoPath", - // getClass().getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo").getPath() + getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/no_updates").getPath(), + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/dataset") @@ -134,34 +129,24 @@ public class BulkTagJobTest { @Test public void bulktagBySubjectNoPreviousContextTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/nocontext") + .getPath(); + final String pathMap = BulkTagJobTest.pathMap; SparkBulkTagJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource("/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-isLookUpUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-pathMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/dataset") @@ -240,32 +225,22 @@ public class BulkTagJobTest { @Test public void bulktagBySubjectPreviousContextNoProvenanceTest() throws Exception { + final String sourcePath = getClass() + .getResource( + "/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/contextnoprovenance") + .getPath(); + final String pathMap = BulkTagJobTest.pathMap; SparkBulkTagJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-isLookUpUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-pathMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap }); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); @@ -332,34 +307,23 @@ public class BulkTagJobTest { @Test public void bulktagByDatasourceTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource") + .getPath(); SparkBulkTagJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource("/eu/dnetlib/dhp/sample/publication/update_datasource") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Publication", - "-outputPath", - workingDir.toString() + "/publication", - "-isLookUpUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-pathMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-outputPath", workingDir.toString() + "/publication", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/publication") @@ -415,35 +379,24 @@ public class BulkTagJobTest { @Test public void bulktagByZenodoCommunityTest() throws Exception { + final String sourcePath = getClass() + .getResource( + "/eu/dnetlib/dhp/bulktag/sample/otherresearchproduct/update_zenodocommunity") + .getPath(); SparkBulkTagJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct", - "-outputPath", - workingDir.toString() + "/orp", - "-isLookUpUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-pathMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct", + "-outputPath", workingDir.toString() + "/orp", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/orp") @@ -548,34 +501,23 @@ public class BulkTagJobTest { @Test public void bulktagBySubjectDatasourceTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource") + .getPath(); SparkBulkTagJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource("/eu/dnetlib/dhp/sample/dataset/update_subject_datasource") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-isLookUpUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-pathMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/dataset") @@ -691,29 +633,17 @@ public class BulkTagJobTest { SparkBulkTagJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass().getResource("/eu/dnetlib/dhp/sample/software/").getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Software", - "-outputPath", - workingDir.toString() + "/software", - "-isLookUpUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-pathMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", getClass().getResource("/eu/dnetlib/dhp/bulktag/sample/software/").getPath(), + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software", + "-outputPath", workingDir.toString() + "/software", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/software") @@ -796,35 +726,24 @@ public class BulkTagJobTest { @Test public void bulktagDatasourcewithConstraintsTest() throws Exception { + final String sourcePath = getClass() + .getResource( + "/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints") + .getPath(); SparkBulkTagJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-isLookUpUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-pathMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-taggingConf", taggingConf, + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-isLookUpUrl", MOCK_IS_LOOK_UP_URL, + "-pathMap", pathMap }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/dataset") diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/CommunityConfigurationFactoryTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/CommunityConfigurationFactoryTest.java new file mode 100644 index 000000000..ca737b79f --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/CommunityConfigurationFactoryTest.java @@ -0,0 +1,85 @@ + +package eu.dnetlib.dhp.bulktag; + +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.util.*; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.dom4j.DocumentException; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import com.google.gson.Gson; + +import eu.dnetlib.dhp.bulktag.community.CommunityConfiguration; +import eu.dnetlib.dhp.bulktag.community.CommunityConfigurationFactory; +import eu.dnetlib.dhp.bulktag.community.Constraint; +import eu.dnetlib.dhp.bulktag.community.SelectionConstraints; +import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; + +/** Created by miriam on 03/08/2018. */ +public class CommunityConfigurationFactoryTest { + + private final VerbResolver resolver = new VerbResolver(); + + @Test + public void parseTest() throws DocumentException, IOException { + String xml = IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.xml")); + final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); + Assertions.assertEquals(5, cc.size()); + cc + .getCommunityList() + .forEach(c -> Assertions.assertTrue(StringUtils.isNoneBlank(c.getId()))); + } + + @Test + public void applyVerb() + throws InvocationTargetException, IllegalAccessException, NoSuchMethodException, + InstantiationException { + Constraint sc = new Constraint(); + sc.setVerb("not_contains"); + sc.setField("contributor"); + sc.setValue("DARIAH"); + sc.setSelection(resolver.getSelectionCriteria(sc.getVerb(), sc.getValue())); + String metadata = "This work has been partially supported by DARIAH-EU infrastructure"; + Assertions.assertFalse(sc.verifyCriteria(metadata)); + } + + @Test + public void loadSelCriteriaTest() throws DocumentException, IOException { + String xml = IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.xml")); + final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); + Map> param = new HashMap<>(); + param.put("author", new ArrayList<>(Collections.singletonList("Pippo Pippi"))); + param + .put( + "description", + new ArrayList<>( + Collections + .singletonList( + "This work has been partially supported by DARIAH-EU infrastructure"))); + param + .put( + "contributor", + new ArrayList<>( + Collections + .singletonList( + "Author X helped to write the paper. X works for DARIAH"))); + List comm = cc + .getCommunityForDatasource( + "openaire____::1cfdb2e14977f31a98e0118283401f32", param); + Assertions.assertEquals(1, comm.size()); + Assertions.assertEquals("dariah", comm.get(0)); + } + +} diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java similarity index 89% rename from dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java index 2370d5e6c..88ad43b6b 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java @@ -5,12 +5,15 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assertions; @@ -26,12 +29,11 @@ import eu.dnetlib.dhp.schema.oaf.Software; import scala.Tuple2; public class CountryPropagationJobTest { + private static final Logger log = LoggerFactory.getLogger(CountryPropagationJobTest.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = CountryPropagationJobTest.class.getClassLoader(); - private static SparkSession spark; private static Path workingDir; @@ -101,8 +103,9 @@ public class CountryPropagationJobTest { Assertions.assertEquals(0, verificationDs.filter("size(country) > 2").count()); Dataset countryExploded = verificationDs - .flatMap(row -> row.getCountry().iterator(), Encoders.bean(Country.class)) - .map(c -> c.getClassid(), Encoders.STRING()); + .flatMap( + (FlatMapFunction) row -> row.getCountry().iterator(), Encoders.bean(Country.class)) + .map((MapFunction) c -> c.getClassid(), Encoders.STRING()); Assertions.assertEquals(9, countryExploded.count()); @@ -115,20 +118,18 @@ public class CountryPropagationJobTest { Assertions.assertEquals(2, countryExploded.filter("value = 'JP'").count()); Dataset> countryExplodedWithCountryclassid = verificationDs - .flatMap( - row -> { - List> prova = new ArrayList(); - List country_list = row.getCountry(); - country_list - .stream() - .forEach( - c -> prova - .add( - new Tuple2<>( - row.getId(), c.getClassid()))); - return prova.iterator(); - }, - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + .flatMap((FlatMapFunction>) row -> { + List> prova = new ArrayList(); + List country_list = row.getCountry(); + country_list + .stream() + .forEach( + c -> prova + .add( + new Tuple2<>( + row.getId(), c.getClassid()))); + return prova.iterator(); + }, Encoders.tuple(Encoders.STRING(), Encoders.STRING())); Assertions.assertEquals(9, countryExplodedWithCountryclassid.count()); @@ -178,7 +179,7 @@ public class CountryPropagationJobTest { Dataset> countryExplodedWithCountryclassname = verificationDs .flatMap( - row -> { + (FlatMapFunction>) row -> { List> prova = new ArrayList(); List country_list = row.getCountry(); country_list @@ -239,7 +240,7 @@ public class CountryPropagationJobTest { Dataset> countryExplodedWithCountryProvenance = verificationDs .flatMap( - row -> { + (FlatMapFunction>) row -> { List> prova = new ArrayList(); List country_list = row.getCountry(); country_list diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java similarity index 98% rename from dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java index 0b0ec62d1..edd2e7ba7 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java @@ -29,8 +29,6 @@ public class OrcidPropagationJobTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = OrcidPropagationJobTest.class.getClassLoader(); - private static SparkSession spark; private static Path workingDir; diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java similarity index 65% rename from dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java index 7ed26b6b2..abed028e1 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java @@ -9,6 +9,7 @@ import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; @@ -29,8 +30,6 @@ public class ProjectPropagationJobTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = ProjectPropagationJobTest.class.getClassLoader(); - private static SparkSession spark; private static Path workingDir; @@ -72,34 +71,27 @@ public class ProjectPropagationJobTest { @Test public void NoUpdateTest() throws Exception { + final String potentialUpdateDate = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") + .getPath(); SparkResultToProjectThroughSemRelJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - // "-sourcePath", - // getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-potentialUpdatePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") - .getPath(), + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-hive_metastore_uris", "", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-potentialUpdatePath", potentialUpdateDate, + "-alreadyLinkedPath", alreadyLinkedPath, }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/relation") @@ -115,34 +107,27 @@ public class ProjectPropagationJobTest { */ @Test public void UpdateTenTest() throws Exception { + final String potentialUpdatePath = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") + .getPath(); SparkResultToProjectThroughSemRelJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - // "-sourcePath", - // getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-potentialUpdatePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") - .getPath(), + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-hive_metastore_uris", "", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-potentialUpdatePath", potentialUpdatePath, + "-alreadyLinkedPath", alreadyLinkedPath, }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/relation") @@ -161,8 +146,8 @@ public class ProjectPropagationJobTest { 5, verificationDs .filter( - r -> r.getSource().substring(0, 2).equals("50") - && r.getTarget().substring(0, 2).equals("40") + (FilterFunction) r -> r.getSource().startsWith("50") + && r.getTarget().startsWith("40") && r.getRelClass().equals("isProducedBy")) .count()); Assertions @@ -170,8 +155,8 @@ public class ProjectPropagationJobTest { 5, verificationDs .filter( - r -> r.getSource().substring(0, 2).equals("40") - && r.getTarget().substring(0, 2).equals("50") + (FilterFunction) r -> r.getSource().startsWith("40") + && r.getTarget().startsWith("50") && r.getRelClass().equals("produces")) .count()); @@ -194,34 +179,27 @@ public class ProjectPropagationJobTest { */ @Test public void UpdateMixTest() throws Exception { + final String potentialUpdatepath = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") + .getPath(); SparkResultToProjectThroughSemRelJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - // "-sourcePath", - // getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-potentialUpdatePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") - .getPath(), + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-hive_metastore_uris", "", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-potentialUpdatePath", potentialUpdatepath, + "-alreadyLinkedPath", alreadyLinkedPath, }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/relation") @@ -243,8 +221,8 @@ public class ProjectPropagationJobTest { 4, verificationDs .filter( - r -> r.getSource().substring(0, 2).equals("50") - && r.getTarget().substring(0, 2).equals("40") + (FilterFunction) r -> r.getSource().startsWith("50") + && r.getTarget().startsWith("40") && r.getRelClass().equals("isProducedBy")) .count()); Assertions @@ -252,8 +230,8 @@ public class ProjectPropagationJobTest { 4, verificationDs .filter( - r -> r.getSource().substring(0, 2).equals("40") - && r.getTarget().substring(0, 2).equals("50") + (FilterFunction) r -> r.getSource().startsWith("40") + && r.getTarget().startsWith("50") && r.getRelClass().equals("produces")) .count()); diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java similarity index 89% rename from dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java index ba8fb0831..d739516fc 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java @@ -32,8 +32,6 @@ public class ResultToCommunityJobTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = ResultToCommunityJobTest.class.getClassLoader(); - private static SparkSession spark; private static Path workingDir; @@ -68,33 +66,25 @@ public class ResultToCommunityJobTest { @Test public void testSparkResultToCommunityFromOrganizationJob() throws Exception { + final String preparedInfoPath = getClass() + .getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo") + .getPath(); SparkResultToCommunityFromOrganizationJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", getClass() .getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/sample") .getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-preparedInfoPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo") - .getPath() + "-hive_metastore_uris", "", + "-saveGraph", "true", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-preparedInfoPath", preparedInfoPath }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/dataset") @@ -217,13 +207,6 @@ public class ResultToCommunityJobTest { .get(0) .getString(0)); - /* - * {"communityList":["euromarine","mes"],"resultId":"50|doajarticles::8d817039a63710fcf97e30f14662c6c8"} - * "context" ["id": euromarine] updates = 1 - * {"communityList":["euromarine","mes"],"resultId":"50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6"} context - * = [ni, euromarine] updates = 1 - */ - query = "select id, MyT.id community " + "from dataset " + "lateral view explode(context) c as MyT " diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java similarity index 88% rename from dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java index 13941b4a3..a8e1ab841 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java @@ -29,32 +29,21 @@ import eu.dnetlib.dhp.schema.oaf.Dataset; public class ResultToCommunityJobTest { - private static final Logger log = LoggerFactory - .getLogger( - eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class); + private static final Logger log = LoggerFactory.getLogger(ResultToCommunityJobTest.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class - .getClassLoader(); - private static SparkSession spark; private static Path workingDir; @BeforeAll public static void beforeAll() throws IOException { - workingDir = Files - .createTempDirectory( - eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class - .getSimpleName()); + workingDir = Files.createTempDirectory(ResultToCommunityJobTest.class.getSimpleName()); log.info("using work dir {}", workingDir); SparkConf conf = new SparkConf(); - conf - .setAppName( - eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class - .getSimpleName()); + conf.setAppName(ResultToCommunityJobTest.class.getSimpleName()); conf.setMaster("local[*]"); conf.set("spark.driver.host", "localhost"); @@ -65,7 +54,7 @@ public class ResultToCommunityJobTest { spark = SparkSession .builder() - .appName(OrcidPropagationJobTest.class.getSimpleName()) + .appName(ResultToCommunityJobTest.class.getSimpleName()) .config(conf) .getOrCreate(); } @@ -83,22 +72,18 @@ public class ResultToCommunityJobTest { new String[] { "-isTest", Boolean.TRUE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample") + "-sourcePath", getClass() + .getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample") .getPath(), "-hive_metastore_uris", "", "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", "-outputPath", workingDir.toString() + "/dataset", - "-preparedInfoPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo") + "-preparedInfoPath", getClass() + .getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo") .getPath() }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/dataset") diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/Result2OrganizationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java similarity index 63% rename from dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/Result2OrganizationJobTest.java rename to dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java index e7adb260e..435b76605 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/Result2OrganizationJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultToOrganizationJobTest.java @@ -23,23 +23,19 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.oaf.Relation; -public class Result2OrganizationJobTest { +public class ResultToOrganizationJobTest { - private static final Logger log = LoggerFactory.getLogger(Result2OrganizationJobTest.class); + private static final Logger log = LoggerFactory.getLogger(ResultToOrganizationJobTest.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = Result2OrganizationJobTest.class.getClassLoader(); - private static SparkSession spark; private static Path workingDir; @BeforeAll public static void beforeAll() throws IOException { - workingDir = Files - .createTempDirectory( - SparkResultToOrganizationFromIstRepoJob.class.getSimpleName()); + workingDir = Files.createTempDirectory(SparkResultToOrganizationFromIstRepoJob.class.getSimpleName()); log.info("using work dir {}", workingDir); SparkConf conf = new SparkConf(); @@ -72,40 +68,31 @@ public class Result2OrganizationJobTest { */ @Test public void NoUpdateTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") + .getPath(); + final String datasourceOrganizationPath = getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked") + .getPath(); SparkResultToOrganizationFromIstRepoJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") - .getPath(), - "-hive_metastore_uris", - "", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Software", - - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-datasourceOrganizationPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked") - .getPath(), + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-hive_metastore_uris", "", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-datasourceOrganizationPath", datasourceOrganizationPath, + "-alreadyLinkedPath", alreadyLinkedPath, }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/relation") @@ -123,40 +110,31 @@ public class Result2OrganizationJobTest { */ @Test public void UpdateNoMixTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") + .getPath(); + final String datasourceOrganizationPath = getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked") + .getPath(); SparkResultToOrganizationFromIstRepoJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") - .getPath(), - "-hive_metastore_uris", - "", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Software", - - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-datasourceOrganizationPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked") - .getPath(), + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-hive_metastore_uris", "", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-datasourceOrganizationPath", datasourceOrganizationPath, + "-alreadyLinkedPath", alreadyLinkedPath, }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/relation") @@ -197,40 +175,31 @@ public class Result2OrganizationJobTest { @Test public void UpdateMixTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix") + .getPath(); + final String datasourceOrganizationPath = getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization") + .getPath(); + final String alreadyLinkedPath = getClass() + .getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked") + .getPath(); SparkResultToOrganizationFromIstRepoJob .main( new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix") - .getPath(), - "-hive_metastore_uris", - "", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Software", - - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-datasourceOrganizationPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked") - .getPath(), + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", sourcePath, + "-hive_metastore_uris", "", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Software", + "-saveGraph", "true", + "-outputPath", workingDir.toString() + "/relation", + "-datasourceOrganizationPath", datasourceOrganizationPath, + "-alreadyLinkedPath", alreadyLinkedPath, }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD tmp = sc .textFile(workingDir.toString() + "/relation") diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.json similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.json rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.json diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.xml b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.xml similarity index 95% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.xml rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.xml index 8fec18593..e2cc41063 100644 --- a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration.xml +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration.xml @@ -2,17 +2,17 @@ - + - + - + @@ -35,7 +35,7 @@ SDG9 - Industry innovation and infrastructure SDG16 - Peace justice and strong institutions - + 123 @@ -45,12 +45,12 @@ - + - + @@ -74,7 +74,7 @@ brain magnetic resonance imaging brain abnormalities - + re3data_____::5b9bf9171d92df854cf3c520692e9122 @@ -95,7 +95,7 @@ doajarticles::0c0e74daa5d95504eade9c81ebbd5b8a - + @@ -106,12 +106,12 @@ aqua sea - + re3data_____::9633d1e8c4309c833c2c442abeb0cfeb - + @@ -134,7 +134,7 @@ food distribution forestry - + opendoar____::1a551829d50f1400b0dab21fdd969c04 @@ -159,18 +159,18 @@ opendoar____::87ae6fb631f7c8a627e8e28785d9992d - + oac_clarin - + re3data_____::a507cdacc5bbcc08761c92185dee5cab - + \ No newline at end of file diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.json similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.json rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.json diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.xml b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.xml similarity index 98% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.xml rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.xml index ad31e1763..cd5ea38d0 100644 --- a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.xml +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit.xml @@ -2,17 +2,17 @@ - + - + - + @@ -45,7 +45,7 @@ - + diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.json similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.json rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.json diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml similarity index 98% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml index 4f0d25f34..a44372e4d 100644 --- a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml @@ -9,17 +9,20 @@ - Result: 2 + + - Result: 3 + + - Result: 4 + + @@ -29,7 +32,8 @@ - Result: 5 + + @@ -39,7 +43,8 @@ - Result: 6 + + SDG13 - Climate action SDG8 - Decent work and economic growth @@ -62,7 +67,8 @@ - Result: 7 + + modern art monuments @@ -253,7 +259,8 @@ - Result: 8 + + Stock Assessment pelagic @@ -372,7 +379,8 @@ - Result: 9 + + brain mapping brain imaging @@ -486,7 +494,8 @@ - Result: 10 + + marine ocean @@ -686,7 +695,8 @@ - Result: 11 + + @@ -700,12 +710,14 @@ - Result: 12 + + - Result: 13 + + animal production and health fisheries and aquaculture @@ -827,7 +839,8 @@ - Result: 14 + + @@ -846,17 +859,20 @@ - Result: 15 + + - Result: 16 + + - Result: 17 + + Green Transport City mobility systems @@ -1154,22 +1170,26 @@ - Result: 18 + + - Result: 19 + + - Result: 20 + + - Result: 21 + + Sustainability-oriented science policy STI policies @@ -1272,7 +1292,8 @@ - Result: 22 + + COVID-19 Severe acute respiratory syndrome coronavirus 2 @@ -1387,4 +1408,4 @@ - + \ No newline at end of file diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/no_updates/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/no_updates/dataset_10.json.gz similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/no_updates/dataset_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/no_updates/dataset_10.json.gz diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints/dataset_10.json.gz similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints/dataset_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_datasourcewithconstraints/dataset_10.json.gz diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/contextnoprovenance/dataset_10.json.gz similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance/dataset_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/contextnoprovenance/dataset_10.json.gz diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/nocontext/dataset_10.json.gz similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext/dataset_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/nocontext/dataset_10.json.gz diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject_datasource/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/update_subject_datasource/dataset_10.json.gz similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/dataset/update_subject_datasource/dataset_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject/update_subject_datasource/dataset_10.json.gz diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource/dataset_10.json.gz new file mode 100644 index 000000000..fdc76a04c Binary files /dev/null and b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/dataset/update_subject_datasource/dataset_10.json.gz differ diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity/otherresearchproduct_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/otherresearchproduct/update_zenodocommunity/otherresearchproduct_10.json.gz similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity/otherresearchproduct_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/otherresearchproduct/update_zenodocommunity/otherresearchproduct_10.json.gz diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/publication/update_datasource/publication_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/publication_10.json.gz similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/publication/update_datasource/publication_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/publication_10.json.gz diff --git a/dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/software/software_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/software/software_10.json.gz similarity index 100% rename from dhp-workflows/dhp-bulktag/src/test/resources/eu/dnetlib/dhp/sample/software/software_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/bulktag/sample/software/software_10.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/countrypropagation/preparedInfo/preparedInfo.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/countrypropagation/preparedInfo/preparedInfo.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/countrypropagation/preparedInfo/preparedInfo.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/countrypropagation/preparedInfo/preparedInfo.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/countrypropagation/sample/software/software_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/countrypropagation/sample/software/software_10.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/countrypropagation/sample/software/software_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/countrypropagation/sample/software/software_10.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc/mergedOrcid_17.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc/mergedOrcid_17.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc/mergedOrcid_17.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc/mergedOrcid_17.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate/dataset_10.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate/dataset_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate/dataset_10.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate/dataset_10.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate/dataset_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate/dataset_10.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset_10.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates/dataset_10.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked/alreadyLinked.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked/alreadyLinked.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked/alreadyLinked.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked/alreadyLinked.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates/potentialUpdates.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates/potentialUpdates.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates/potentialUpdates.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates/potentialUpdates.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates/potentialUpdates.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates/potentialUpdates.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates/potentialUpdates.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates/potentialUpdates.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates/potentialUpdates.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates/potentialUpdates.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates/potentialUpdates.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates/potentialUpdates.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo/resultCommunityList.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo/resultCommunityList.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo/resultCommunityList.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo/resultCommunityList.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/sample/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/sample/dataset_10.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/sample/dataset_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/sample/dataset_10.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo/mergedResultCommunityList.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo/mergedResultCommunityList.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo/mergedResultCommunityList.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo/mergedResultCommunityList.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample/dataset_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample/dataset_10.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample/dataset_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample/dataset_10.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix/software_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix/software_10.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix/software_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix/software_10.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix/software_10.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix/software_10.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix/software_10.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix/software_10.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked/alreadyLinked_20.json.gz diff --git a/dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz similarity index 100% rename from dhp-workflows/dhp-propagation/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz rename to dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization/datasourceOrganization_28.json.gz diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java new file mode 100644 index 000000000..f88f7457f --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java @@ -0,0 +1,79 @@ + +package eu.dnetlib.dhp.oa.graph.hive; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; +import static eu.dnetlib.dhp.schema.common.ModelSupport.tableIdentifier; + +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Oaf; + +public class GraphHiveTableImporterJob { + + private static final Logger log = LoggerFactory.getLogger(GraphHiveTableImporterJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GraphHiveTableImporterJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json"))); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + String hiveDbName = parser.get("hiveDbName"); + log.info("hiveDbName: {}", hiveDbName); + + final String className = parser.get("className"); + log.info("className: {}", className); + + Class clazz = (Class) Class.forName(className); + + String hiveMetastoreUris = parser.get("hiveMetastoreUris"); + log.info("hiveMetastoreUris: {}", hiveMetastoreUris); + + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", hiveMetastoreUris); + + runWithSparkHiveSession( + conf, isSparkSessionManaged, spark -> loadGraphTable(spark, inputPath, hiveDbName, clazz)); + } + + // protected for testing + private static void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName, + Class clazz) { + + spark + .read() + .textFile(inputPath) + .map((MapFunction) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(tableIdentifier(hiveDbName, clazz)); + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index fd12716b4..b9c4e6c80 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -127,7 +127,6 @@ public abstract class AbstractMdRecordToOafMapper { final List oafs = new ArrayList<>(); switch (type.toLowerCase()) { - case "": case "publication": final Publication p = new Publication(); populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); @@ -138,7 +137,7 @@ public abstract class AbstractMdRecordToOafMapper { case "dataset": final Dataset d = new Dataset(); populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); + d.setResulttype(DATASET_DEFAULT_RESULTTYPE); d.setStoragedate(prepareDatasetStorageDate(doc, info)); d.setDevice(prepareDatasetDevice(doc, info)); d.setSize(prepareDatasetSize(doc, info)); @@ -158,6 +157,7 @@ public abstract class AbstractMdRecordToOafMapper { s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); oafs.add(s); break; + case "": case "otherresearchproducts": default: final OtherResearchProduct o = new OtherResearchProduct(); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index e5e348642..5b8296c19 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -50,8 +50,7 @@ import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -public class MigrateDbEntitiesApplication extends AbstractMigrationApplication - implements Closeable { +public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable { private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); @@ -128,9 +127,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication } public List processDatasource(final ResultSet rs) { - try { - final DataInfo info = prepareDataInfo(rs); final Datasource ds = new Datasource(); @@ -194,7 +191,6 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication public List processProject(final ResultSet rs) { try { - final DataInfo info = prepareDataInfo(rs); final Project p = new Project(); @@ -249,9 +245,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication } public List processOrganization(final ResultSet rs) { - try { - final DataInfo info = prepareDataInfo(rs); final Organization o = new Organization(); @@ -370,14 +364,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication final DataInfo info = dataInfo( false, null, false, false, - qualifier(USER_CLAIM, USER_CLAIM, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS), "0.9"); final List collectedFrom = listKeyValues( createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE"); try { - if (rs.getString(SOURCE_TYPE).equals("context")) { final Result r; @@ -461,9 +453,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication final Boolean inferred = rs.getBoolean("inferred"); final String trust = rs.getString("trust"); return dataInfo( - - deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust); - + deletedbyinference, + inferenceprovenance, + inferred, + false, + ENTITYREGISTRY_PROVENANCE_ACTION, + trust); } private Qualifier prepareQualifierSplitting(final String s) { @@ -535,4 +530,5 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication super.close(); dbClient.close(); } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 891fee57e..54594cb80 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -1,8 +1,7 @@ package eu.dnetlib.dhp.oa.graph.raw; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; -import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; +import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import java.util.*; @@ -10,11 +9,13 @@ import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; +import org.dom4j.Element; import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; public class OafToOafMapper extends AbstractMdRecordToOafMapper { @@ -28,15 +29,26 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { final List res = new ArrayList<>(); int pos = 1; for (final Object o : doc.selectNodes("//dc:creator")) { - final Node n = (Node) o; + final Element e = (Element) o; final Author author = new Author(); - author.setFullname(n.getText()); + author.setFullname(e.getText()); author.setRank(pos++); - final PacePerson p = new PacePerson(n.getText(), false); + final PacePerson p = new PacePerson(e.getText(), false); if (p.isAccurate()) { author.setName(p.getNormalisedFirstName()); author.setSurname(p.getNormalisedSurname()); } + + final String pid = e.attributeValue("nameIdentifier"); + final String pidType = e.attributeValue("nameIdentifierScheme"); + + if (StringUtils.isNotBlank(pid) && StringUtils.isNotBlank(pidType)) { + author.setPid(new ArrayList<>()); + author + .getPid() + .add(structuredProperty(pid, qualifier(pidType, pidType, DNET_PID_TYPES, DNET_PID_TYPES), info)); + } + res.add(author); } return res; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 04984d008..30b980c42 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -12,6 +12,7 @@ import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Node; +import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -44,20 +45,35 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : doc.selectNodes("//datacite:creator")) { final Node n = (Node) o; final Author author = new Author(); - author.setFullname(n.valueOf("./datacite:creatorName")); - author.setName(n.valueOf("./datacite:givenName")); - author.setSurname(n.valueOf("./datacite:familyName")); - author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); - author.setPid(preparePids(doc, info)); + final String fullname = n.valueOf("./datacite:creatorName"); + author.setFullname(fullname); + + PacePerson pp = new PacePerson(fullname, false); + final String name = n.valueOf("./datacite:givenName"); + if (StringUtils.isBlank(name) & pp.isAccurate()) { + author.setName(pp.getNormalisedFirstName()); + } else { + author.setName(name); + } + + final String surname = n.valueOf("./datacite:familyName"); + if (StringUtils.isBlank(surname) & pp.isAccurate()) { + author.setSurname(pp.getNormalisedSurname()); + } else { + author.setSurname(surname); + } + + author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info)); + author.setPid(preparePids(n, info)); author.setRank(pos++); res.add(author); } return res; } - private List preparePids(final Document doc, final DataInfo info) { + private List preparePids(final Node n, final DataInfo info) { final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) { + for (final Object o : n.selectNodes("./datacite:nameIdentifier")) { res .add( structuredProperty( @@ -77,8 +93,6 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { final KeyValue hostedby) { final Instance instance = new Instance(); - final Set url = new HashSet<>(); - instance.setUrl(new ArrayList<>()); instance .setInstancetype( prepareQualifier( @@ -97,6 +111,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { .setProcessingchargecurrency( field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + final Set url = new HashSet<>(); for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { url.add(((Node) o).getText().trim()); } @@ -109,7 +124,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='DOI']")) { url.add(HTTP_DX_DOI_PREIFX + ((Node) o).getText().trim()); } - instance.getUrl().addAll(url); + if (!url.isEmpty()) { + instance.setUrl(new ArrayList<>()); + instance.getUrl().addAll(url); + } return Arrays.asList(instance); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java index d1c615dcd..6e474f2f3 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.graph.raw.common; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.text.Normalizer; import java.util.HashSet; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql new file mode 100644 index 000000000..484afde80 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/lib/scripts/reset_db.sql @@ -0,0 +1,2 @@ +DROP DATABASE IF EXISTS ${hiveDbName} CASCADE; +CREATE DATABASE ${hiveDbName}; \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml index e837ac6b3..8566d7667 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml @@ -72,18 +72,45 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + + + + + hive.metastore.uris + ${hiveMetastoreUris} + + + ${hiveJdbcUrl}/${hiveDbName} + + hiveDbName=${hiveDbName} + + + + + + + + + + + + + + + + + yarn cluster - MapGraphAsHiveDB - eu.dnetlib.dhp.oa.graph.hive.GraphHiveImporterJob + Import table publication + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob dhp-graph-mapper-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -95,18 +122,201 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --inputPath${inputPath} + --inputPath${inputPath}/publication --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Publication --hiveMetastoreUris${hiveMetastoreUris} - + + + + yarn + cluster + Import table dataset + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/dataset + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Dataset + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table otherresearchproduct + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/otherresearchproduct + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table software + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/software + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Software + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table datasource + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/datasource + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Datasource + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table organization + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/organization + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Organization + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table project + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/project + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Project + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + + yarn + cluster + Import table project + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${inputPath}/relation + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Relation + --hiveMetastoreUris${hiveMetastoreUris} + + + + + + + - ${jobTracker} - ${nameNode} hive.metastore.uris @@ -122,4 +332,5 @@ + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json new file mode 100644 index 000000000..d6c13773a --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_db_importer_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "in", + "paramLongName": "inputPath", + "paramDescription": "the path to the graph data dump to read", + "paramRequired": true + }, + { + "paramName": "hmu", + "paramLongName": "hiveMetastoreUris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "db", + "paramLongName": "hiveDbName", + "paramDescription": "the target hive database name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json new file mode 100644 index 000000000..5b5b0743c --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive_table_importer_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "in", + "paramLongName": "inputPath", + "paramDescription": "the path to the graph data dump to read", + "paramRequired": true + }, + { + "paramName": "hmu", + "paramLongName": "hiveMetastoreUris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "db", + "paramLongName": "hiveDbName", + "paramDescription": "the target hive database name", + "paramRequired": true + }, + { + "paramName": "tn", + "paramLongName": "className", + "paramDescription": "the class modelling the target table", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json deleted file mode 100644 index c4910ec61..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_actionsets_parameters.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}, - {"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true}, - {"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true}, - {"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true}, - {"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true}, - {"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true}, - {"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true}, - {"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true} -] diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json deleted file mode 100644 index 6fa10f739..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/transform_actionsets_parameters.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "is", - "paramLongName": "isLookupUrl", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, - { - "paramName": "i", - "paramLongName": "inputPaths", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - } -] diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 951c97d9d..5a006e351 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -10,6 +10,7 @@ import static org.mockito.Mockito.when; import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -19,11 +20,8 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; @ExtendWith(MockitoExtension.class) public class MappersTest { @@ -54,7 +52,29 @@ public class MappersTest { assertValidId(p.getId()); assertValidId(p.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertTrue(p.getAuthor().size() > 0); + Optional author = p + .getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); + assertTrue(author.isPresent()); + StructuredProperty pid = author + .get() + .getPid() + .stream() + .findFirst() + .get(); + assertEquals("0000-0001-6651-1178", pid.getValue()); + assertEquals("ORCID", pid.getQualifier().getClassid()); + assertEquals("ORCID", pid.getQualifier().getClassname()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); + assertEquals("Votsi,Nefta", author.get().getFullname()); + assertEquals("Votsi", author.get().getSurname()); + assertEquals("Nefta", author.get().getName()); + assertTrue(p.getSubject().size() > 0); assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); @@ -100,6 +120,38 @@ public class MappersTest { assertValidId(d.getCollectedfrom().get(0).getKey()); assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(d.getAuthor().size() > 0); + + Optional author = d + .getAuthor() + .stream() + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) + .findFirst(); + assertTrue(author.isPresent()); + StructuredProperty pid = author + .get() + .getPid() + .stream() + .findFirst() + .get(); + assertEquals("0000-0001-9074-1619", pid.getValue()); + assertEquals("ORCID", pid.getQualifier().getClassid()); + assertEquals("ORCID", pid.getQualifier().getClassname()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid()); + assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename()); + assertEquals("Baracchini, Theo", author.get().getFullname()); + assertEquals("Baracchini", author.get().getSurname()); + assertEquals("Theo", author.get().getName()); + + assertEquals(1, author.get().getAffiliation().size()); + Optional> opAff = author + .get() + .getAffiliation() + .stream() + .findFirst(); + assertTrue(opAff.isPresent()); + Field affiliation = opAff.get(); + assertEquals("ISTI-CNR", affiliation.getValue()); + assertTrue(d.getSubject().size() > 0); assertTrue(d.getInstance().size() > 0); assertTrue(d.getContext().size() > 0); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml index e898d4434..2cb0ba1c7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml @@ -19,7 +19,7 @@ Ecosystem Service capacity is higher in areas of multiple designation types Nikolaidou,Charitini - Votsi,Nefta + Votsi,Nefta Sgardelis,Steanos Halley,John Pantis,John diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml index 94dc802fa..88ae9d106 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_dataset.xml @@ -35,9 +35,10 @@ Baracchini, Theo + 0000-0001-9074-1619 Theo Baracchini - Physics of Aquatic Systems Laboratory (APHYS) – Margaretha Kamprad Chair, ENAC, EPFL, Lausanne, 1015, Switzerland + ISTI-CNR Wüest, Alfred diff --git a/dhp-workflows/dhp-propagation/pom.xml b/dhp-workflows/dhp-propagation/pom.xml deleted file mode 100644 index 9492fa7c5..000000000 --- a/dhp-workflows/dhp-propagation/pom.xml +++ /dev/null @@ -1,43 +0,0 @@ - - - - dhp-workflows - eu.dnetlib.dhp - 1.2.1-SNAPSHOT - - 4.0.0 - - dhp-propagation - - - - org.apache.spark - spark-core_2.11 - - - org.apache.spark - spark-sql_2.11 - - - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - eu.dnetlib.dhp - dhp-schemas - ${project.version} - - - org.apache.spark - spark-hive_2.11 - test - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml b/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml deleted file mode 100644 index dd7f25846..000000000 --- a/dhp-workflows/dhp-propagation/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml +++ /dev/null @@ -1,188 +0,0 @@ - - - - sourcePath - the source path - - - allowedsemrels - the allowed semantics - - - outputPath - the output path - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/relation - ${nameNode}/${outputPath}/relation - - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/publication - ${nameNode}/${outputPath}/publication - - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/dataset - ${nameNode}/${outputPath}/dataset - - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/otherresearchproduct - ${nameNode}/${outputPath}/otherresearchproduct - - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/software - ${nameNode}/${outputPath}/software - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/organization - ${nameNode}/${outputPath}/organization - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/project - ${nameNode}/${outputPath}/project - - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/datasource - ${nameNode}/${outputPath}/datasource - - - - - - - - - - - yarn - cluster - PrepareProjectResultsAssociation - eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation - dhp-propagation-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --sourcePath${sourcePath}/relation - --allowedsemrels${allowedsemrels} - --hive_metastore_uris${hive_metastore_uris} - --potentialUpdatePath${workingDir}/preparedInfo/potentialUpdates - --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked - - - - - - - - yarn - cluster - ProjectToResultPropagation - eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob - dhp-propagation-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.dynamicAllocation.enabled=true - --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - - --saveGraph${saveGraph} - --hive_metastore_uris${hive_metastore_uris} - --outputPath${outputPath}/relation - --potentialUpdatePath${workingDir}/preparedInfo/potentialUpdates - --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index badd8ca8a..271c66939 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -22,8 +22,7 @@ dhp-actionmanager dhp-graph-mapper dhp-dedup-openaire - dhp-bulktag - dhp-propagation + dhp-enrichment dhp-graph-provision dhp-dedup-scholexplorer dhp-graph-provision-scholexplorer