This commit is contained in:
Miriam Baglioni 2020-05-08 12:54:52 +02:00
parent 4c94231cad
commit 28556507e7
4 changed files with 312 additions and 82 deletions

View File

@ -1,4 +1,3 @@
package eu.dnetlib.dhp.schema.common; package eu.dnetlib.dhp.schema.common;
import java.util.Map; import java.util.Map;
@ -13,7 +12,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class ModelSupport { public class ModelSupport {
/** Defines the mapping between the actual entity type and the main entity type */ /** Defines the mapping between the actual entity type and the main entity type */
private static final Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap(); private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
static { static {
entityMapping.put(EntityType.publication, MainEntityType.result); entityMapping.put(EntityType.publication, MainEntityType.result);
@ -53,6 +52,232 @@ public class ModelSupport {
oafTypes.put("relation", Relation.class); oafTypes.put("relation", Relation.class);
} }
public static final Map<String, String> entityIdPrefix = Maps.newHashMap();
static {
entityIdPrefix.put("datasource", "10");
entityIdPrefix.put("organization", "20");
entityIdPrefix.put("project", "40");
entityIdPrefix.put("result", "50");
}
public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();
static {
relationInverseMap
.put(
"personResult_authorship_isAuthorOf", new RelationInverse()
.setRelation("isAuthorOf")
.setInverse("hasAuthor")
.setRelType("personResult")
.setSubReltype("authorship"));
relationInverseMap
.put(
"personResult_authorship_hasAuthor", new RelationInverse()
.setInverse("isAuthorOf")
.setRelation("hasAuthor")
.setRelType("personResult")
.setSubReltype("authorship"));
relationInverseMap
.put(
"projectOrganization_participation_isParticipant", new RelationInverse()
.setRelation("isParticipant")
.setInverse("hasParticipant")
.setRelType("projectOrganization")
.setSubReltype("participation"));
relationInverseMap
.put(
"projectOrganization_participation_hasParticipant", new RelationInverse()
.setInverse("isParticipant")
.setRelation("hasParticipant")
.setRelType("projectOrganization")
.setSubReltype("participation"));
relationInverseMap
.put(
"resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse()
.setRelation("hasAuthorInstitution")
.setInverse("isAuthorInstitutionOf")
.setRelType("resultOrganization")
.setSubReltype("affiliation"));
relationInverseMap
.put(
"resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse()
.setInverse("hasAuthorInstitution")
.setRelation("isAuthorInstitutionOf")
.setRelType("resultOrganization")
.setSubReltype("affiliation"));
relationInverseMap
.put(
"organizationOrganization_dedup_merges", new RelationInverse()
.setRelation("merges")
.setInverse("isMergedIn")
.setRelType("organizationOrganization")
.setSubReltype("dedup"));
relationInverseMap
.put(
"organizationOrganization_dedup_isMergedIn", new RelationInverse()
.setInverse("merges")
.setRelation("isMergedIn")
.setRelType("organizationOrganization")
.setSubReltype("dedup"));
relationInverseMap
.put(
"organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse()
.setInverse("isSimilarTo")
.setRelation("isSimilarTo")
.setRelType("organizationOrganization")
.setSubReltype("dedupSimilarity"));
relationInverseMap
.put(
"resultProject_outcome_isProducedBy", new RelationInverse()
.setRelation("isProducedBy")
.setInverse("produces")
.setRelType("resultProject")
.setSubReltype("outcome"));
relationInverseMap
.put(
"resultProject_outcome_produces", new RelationInverse()
.setInverse("isProducedBy")
.setRelation("produces")
.setRelType("resultProject")
.setSubReltype("outcome"));
relationInverseMap
.put(
"projectPerson_contactPerson_isContact", new RelationInverse()
.setRelation("isContact")
.setInverse("hasContact")
.setRelType("projectPerson")
.setSubReltype("contactPerson"));
relationInverseMap
.put(
"projectPerson_contactPerson_hasContact", new RelationInverse()
.setInverse("isContact")
.setRelation("hasContact")
.setRelType("personPerson")
.setSubReltype("coAuthorship"));
relationInverseMap
.put(
"personPerson_coAuthorship_isCoauthorOf", new RelationInverse()
.setInverse("isCoAuthorOf")
.setRelation("isCoAuthorOf")
.setRelType("personPerson")
.setSubReltype("coAuthorship"));
relationInverseMap
.put(
"personPerson_dedup_merges", new RelationInverse()
.setInverse("isMergedIn")
.setRelation("merges")
.setRelType("personPerson")
.setSubReltype("dedup"));
relationInverseMap
.put(
"personPerson_dedup_isMergedIn", new RelationInverse()
.setInverse("merges")
.setRelation("isMergedIn")
.setRelType("personPerson")
.setSubReltype("dedup"));
relationInverseMap
.put(
"personPerson_dedupSimilarity_isSimilarTo", new RelationInverse()
.setInverse("isSimilarTo")
.setRelation("isSimilarTo")
.setRelType("personPerson")
.setSubReltype("dedupSimilarity"));
relationInverseMap
.put(
"datasourceOrganization_provision_isProvidedBy", new RelationInverse()
.setInverse("provides")
.setRelation("isProvidedBy")
.setRelType("datasourceOrganization")
.setSubReltype("provision"));
relationInverseMap
.put(
"datasourceOrganization_provision_provides", new RelationInverse()
.setInverse("isProvidedBy")
.setRelation("provides")
.setRelType("datasourceOrganization")
.setSubReltype("provision"));
relationInverseMap
.put(
"resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse()
.setInverse("isAmongTopNSimilarDocuments")
.setRelation("hasAmongTopNSimilarDocuments")
.setRelType("resultResult")
.setSubReltype("similarity"));
relationInverseMap
.put(
"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
.setInverse("hasAmongTopNSimilarDocuments")
.setRelation("isAmongTopNSimilarDocuments")
.setRelType("resultResult")
.setSubReltype("similarity"));
relationInverseMap
.put(
"resultResult_relationship_isRelatedTo", new RelationInverse()
.setInverse("isRelatedTo")
.setRelation("isRelatedTo")
.setRelType("resultResult")
.setSubReltype("relationship"));
relationInverseMap
.put(
"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
.setInverse("hasAmongTopNSimilarDocuments")
.setRelation("isAmongTopNSimilarDocuments")
.setRelType("resultResult")
.setSubReltype("similarity"));
relationInverseMap
.put(
"resultResult_supplement_isSupplementTo", new RelationInverse()
.setInverse("isSupplementedBy")
.setRelation("isSupplementTo")
.setRelType("resultResult")
.setSubReltype("supplement"));
relationInverseMap
.put(
"resultResult_supplement_isSupplementedBy", new RelationInverse()
.setInverse("isSupplementTo")
.setRelation("isSupplementedBy")
.setRelType("resultResult")
.setSubReltype("supplement"));
relationInverseMap
.put(
"resultResult_part_isPartOf", new RelationInverse()
.setInverse("hasPart")
.setRelation("isPartOf")
.setRelType("resultResult")
.setSubReltype("part"));
relationInverseMap
.put(
"resultResult_part_hasPart", new RelationInverse()
.setInverse("isPartOf")
.setRelation("hasPart")
.setRelType("resultResult")
.setSubReltype("part"));
relationInverseMap
.put(
"resultResult_dedup_merges", new RelationInverse()
.setInverse("isMergedIn")
.setRelation("merges")
.setRelType("resultResult")
.setSubReltype("dedup"));
relationInverseMap
.put(
"resultResult_dedup_isMergedIn", new RelationInverse()
.setInverse("merges")
.setRelation("isMergedIn")
.setRelType("resultResult")
.setSubReltype("dedup"));
relationInverseMap
.put(
"resultResult_dedupSimilarity_isSimilarTo", new RelationInverse()
.setInverse("isSimilarTo")
.setRelation("isSimilarTo")
.setRelType("resultResult")
.setSubReltype("dedupSimilarity"));
}
private static final String schemeTemplate = "dnet:%s_%s_relations"; private static final String schemeTemplate = "dnet:%s_%s_relations";
private ModelSupport() { private ModelSupport() {
@ -68,7 +293,7 @@ public class ModelSupport {
* @return True if X is a subclass of Y * @return True if X is a subclass of Y
*/ */
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass( public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
X subClazzObject, Y superClazzObject) { X subClazzObject, Y superClazzObject) {
return isSubClass(subClazzObject.getClass(), superClazzObject.getClass()); return isSubClass(subClazzObject.getClass(), superClazzObject.getClass());
} }
@ -82,7 +307,7 @@ public class ModelSupport {
* @return True if X is a subclass of Y * @return True if X is a subclass of Y
*/ */
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass( public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
X subClazzObject, Class<Y> superClazz) { X subClazzObject, Class<Y> superClazz) {
return isSubClass(subClazzObject.getClass(), superClazz); return isSubClass(subClazzObject.getClass(), superClazz);
} }
@ -96,7 +321,7 @@ public class ModelSupport {
* @return True if X is a subclass of Y * @return True if X is a subclass of Y
*/ */
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass( public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
Class<X> subClazz, Class<Y> superClazz) { Class<X> subClazz, Class<Y> superClazz) {
return superClazz.isAssignableFrom(subClazz); return superClazz.isAssignableFrom(subClazz);
} }
@ -108,32 +333,32 @@ public class ModelSupport {
*/ */
public static <T extends Oaf> Class<T>[] getOafModelClasses() { public static <T extends Oaf> Class<T>[] getOafModelClasses() {
return new Class[] { return new Class[] {
Author.class, Author.class,
Context.class, Context.class,
Country.class, Country.class,
DataInfo.class, DataInfo.class,
Dataset.class, Dataset.class,
Datasource.class, Datasource.class,
ExternalReference.class, ExternalReference.class,
ExtraInfo.class, ExtraInfo.class,
Field.class, Field.class,
GeoLocation.class, GeoLocation.class,
Instance.class, Instance.class,
Journal.class, Journal.class,
KeyValue.class, KeyValue.class,
Oaf.class, Oaf.class,
OafEntity.class, OafEntity.class,
OAIProvenance.class, OAIProvenance.class,
Organization.class, Organization.class,
OriginDescription.class, OriginDescription.class,
OtherResearchProduct.class, OtherResearchProduct.class,
Project.class, Project.class,
Publication.class, Publication.class,
Qualifier.class, Qualifier.class,
Relation.class, Relation.class,
Result.class, Result.class,
Software.class, Software.class,
StructuredProperty.class StructuredProperty.class
}; };
} }
@ -147,10 +372,10 @@ public class ModelSupport {
public static String getScheme(final String sourceType, final String targetType) { public static String getScheme(final String sourceType, final String targetType) {
return String return String
.format( .format(
schemeTemplate, schemeTemplate,
entityMapping.get(EntityType.valueOf(sourceType)).name(), entityMapping.get(EntityType.valueOf(sourceType)).name(),
entityMapping.get(EntityType.valueOf(targetType)).name()); entityMapping.get(EntityType.valueOf(targetType)).name());
} }
public static <T extends Oaf> Function<T, String> idFn() { public static <T extends Oaf> Function<T, String> idFn() {
@ -165,42 +390,41 @@ public class ModelSupport {
private static <T extends Oaf> String idFnForRelation(T t) { private static <T extends Oaf> String idFnForRelation(T t) {
Relation r = (Relation) t; Relation r = (Relation) t;
return Optional return Optional
.ofNullable(r.getSource()) .ofNullable(r.getSource())
.map( .map(
source -> Optional source -> Optional
.ofNullable(r.getTarget()) .ofNullable(r.getTarget())
.map( .map(
target -> Optional target -> Optional
.ofNullable(r.getRelType()) .ofNullable(r.getRelType())
.map( .map(
relType -> Optional relType -> Optional
.ofNullable(r.getSubRelType()) .ofNullable(r.getSubRelType())
.map( .map(
subRelType -> Optional subRelType -> Optional
.ofNullable(r.getRelClass()) .ofNullable(r.getRelClass())
.map( .map(
relClass -> String relClass -> String
.join( .join(
source, source,
target, target,
relType, relType,
subRelType, subRelType,
relClass)) relClass))
.orElse( .orElse(
String String
.join( .join(
source, source,
target, target,
relType, relType,
subRelType))) subRelType)))
.orElse(String.join(source, target, relType))) .orElse(String.join(source, target, relType)))
.orElse(String.join(source, target))) .orElse(String.join(source, target)))
.orElse(source)) .orElse(source))
.orElse(null); .orElse(null);
} }
private static <T extends Oaf> String idFnForOafEntity(T t) { private static <T extends Oaf> String idFnForOafEntity(T t) {
return ((OafEntity) t).getId(); return ((OafEntity) t).getId();
} }
} }

View File

@ -66,19 +66,25 @@ public class PrepareMergedRelationJob {
private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) { private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {
Dataset<Relation> relation = readRelations(spark, inputPath); Dataset<Relation> relation = readRelations(spark, inputPath);
relation.createOrReplaceTempView("relation");
spark relation.filter("relclass = 'merges' and datainfo.deletedbyinference=false")
.sql( .write()
"Select * from relation " + .mode(SaveMode.Overwrite)
"where relclass = 'merges' " + .option("compression","gizp")
"and datainfo.deletedbyinference = false") .json(outputPath);
.as(Encoders.bean(Relation.class)) // relation.createOrReplaceTempView("relation");
.toJSON() //
.write() // spark
.mode(SaveMode.Overwrite) // .sql(
.option("compression", "gzip") // "Select * from relation " +
.text(outputPath); // "where relclass = 'merges' " +
// "and datainfo.deletedbyinference = false")
// .as(Encoders.bean(Relation.class))
// .toJSON()
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .text(outputPath);
} }
public static org.apache.spark.sql.Dataset<Relation> readRelations( public static org.apache.spark.sql.Dataset<Relation> readRelations(

View File

@ -53,7 +53,7 @@ public class ReadBlacklistFromDB implements Closeable {
final String dbUrl = parser.get("postgresUrl"); final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser"); final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword"); final String dbPassword = parser.get("postgresPassword");
final String hdfsPath = parser.get("hdfsPath"); final String hdfsPath = parser.get("hdfsPath") + "/blacklist";
final String hdfsNameNode = parser.get("hdfsNameNode"); final String hdfsNameNode = parser.get("hdfsNameNode");
try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser, try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,

View File

@ -72,7 +72,7 @@ public class SparkRemoveBlacklistedRelationJob {
private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath, private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
String outputPath, String mergesPath) { String outputPath, String mergesPath) {
Dataset<Relation> blackListed = readRelations(spark, blacklistPath); Dataset<Relation> blackListed = readRelations(spark, blacklistPath + "/blacklist");
Dataset<Relation> inputRelation = readRelations(spark, inputPath); Dataset<Relation> inputRelation = readRelations(spark, inputPath);
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath); Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);