master #11

Manually merged
claudio.atzori merged 275 commits from :master into enrichment_wfs 2020-05-11 15:14:56 +02:00
4 changed files with 312 additions and 82 deletions
Showing only changes of commit 28556507e7 - Show all commits

View File

@ -1,4 +1,3 @@
package eu.dnetlib.dhp.schema.common; package eu.dnetlib.dhp.schema.common;
import java.util.Map; import java.util.Map;
@ -13,7 +12,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class ModelSupport { public class ModelSupport {
/** Defines the mapping between the actual entity type and the main entity type */ /** Defines the mapping between the actual entity type and the main entity type */
private static final Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap(); private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
static { static {
entityMapping.put(EntityType.publication, MainEntityType.result); entityMapping.put(EntityType.publication, MainEntityType.result);
@ -53,6 +52,232 @@ public class ModelSupport {
oafTypes.put("relation", Relation.class); oafTypes.put("relation", Relation.class);
} }
/** Identifier prefix registered for each of the entity type names below. */
public static final Map<String, String> entityIdPrefix = Maps.newHashMap();

static {
	// Each row is { entity type name, id prefix }.
	final String[][] prefixByEntity = {
		{ "datasource", "10" },
		{ "organization", "20" },
		{ "project", "40" },
		{ "result", "50" }
	};
	for (final String[] row : prefixByEntity) {
		entityIdPrefix.put(row[0], row[1]);
	}
}
/**
 * Describes every known relation together with its inverse.
 *
 * <p>Keys follow the pattern {@code <relType>_<subRelType>_<relation>}; the value carries the same
 * three components plus the name of the inverse relation. Symmetric relations (e.g.
 * {@code isSimilarTo}, {@code isRelatedTo}) are their own inverse and therefore appear only once.
 */
public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();

static {
	// --- personResult ---
	relationInverseMap
		.put(
			"personResult_authorship_isAuthorOf", new RelationInverse()
				.setRelation("isAuthorOf")
				.setInverse("hasAuthor")
				.setRelType("personResult")
				.setSubReltype("authorship"));
	relationInverseMap
		.put(
			"personResult_authorship_hasAuthor", new RelationInverse()
				.setRelation("hasAuthor")
				.setInverse("isAuthorOf")
				.setRelType("personResult")
				.setSubReltype("authorship"));

	// --- projectOrganization ---
	relationInverseMap
		.put(
			"projectOrganization_participation_isParticipant", new RelationInverse()
				.setRelation("isParticipant")
				.setInverse("hasParticipant")
				.setRelType("projectOrganization")
				.setSubReltype("participation"));
	relationInverseMap
		.put(
			"projectOrganization_participation_hasParticipant", new RelationInverse()
				.setRelation("hasParticipant")
				.setInverse("isParticipant")
				.setRelType("projectOrganization")
				.setSubReltype("participation"));

	// --- resultOrganization ---
	relationInverseMap
		.put(
			"resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse()
				.setRelation("hasAuthorInstitution")
				.setInverse("isAuthorInstitutionOf")
				.setRelType("resultOrganization")
				.setSubReltype("affiliation"));
	relationInverseMap
		.put(
			"resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse()
				.setRelation("isAuthorInstitutionOf")
				.setInverse("hasAuthorInstitution")
				.setRelType("resultOrganization")
				.setSubReltype("affiliation"));

	// --- organizationOrganization ---
	relationInverseMap
		.put(
			"organizationOrganization_dedup_merges", new RelationInverse()
				.setRelation("merges")
				.setInverse("isMergedIn")
				.setRelType("organizationOrganization")
				.setSubReltype("dedup"));
	relationInverseMap
		.put(
			"organizationOrganization_dedup_isMergedIn", new RelationInverse()
				.setRelation("isMergedIn")
				.setInverse("merges")
				.setRelType("organizationOrganization")
				.setSubReltype("dedup"));
	relationInverseMap
		.put(
			"organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse()
				.setRelation("isSimilarTo")
				.setInverse("isSimilarTo")
				.setRelType("organizationOrganization")
				.setSubReltype("dedupSimilarity"));

	// --- resultProject ---
	relationInverseMap
		.put(
			"resultProject_outcome_isProducedBy", new RelationInverse()
				.setRelation("isProducedBy")
				.setInverse("produces")
				.setRelType("resultProject")
				.setSubReltype("outcome"));
	relationInverseMap
		.put(
			"resultProject_outcome_produces", new RelationInverse()
				.setRelation("produces")
				.setInverse("isProducedBy")
				.setRelType("resultProject")
				.setSubReltype("outcome"));

	// --- projectPerson ---
	relationInverseMap
		.put(
			"projectPerson_contactPerson_isContact", new RelationInverse()
				.setRelation("isContact")
				.setInverse("hasContact")
				.setRelType("projectPerson")
				.setSubReltype("contactPerson"));
	// relType/subRelType must agree with the key and with the isContact entry above.
	relationInverseMap
		.put(
			"projectPerson_contactPerson_hasContact", new RelationInverse()
				.setRelation("hasContact")
				.setInverse("isContact")
				.setRelType("projectPerson")
				.setSubReltype("contactPerson"));

	// --- personPerson ---
	// Legacy key: its casing ("isCoauthorOf") differs from the relation name ("isCoAuthorOf").
	// Kept so that any existing lookup using this exact string keeps working.
	relationInverseMap
		.put(
			"personPerson_coAuthorship_isCoauthorOf", new RelationInverse()
				.setRelation("isCoAuthorOf")
				.setInverse("isCoAuthorOf")
				.setRelType("personPerson")
				.setSubReltype("coAuthorship"));
	// Key consistent with the <relType>_<subRelType>_<relation> pattern used by every other entry.
	relationInverseMap
		.put(
			"personPerson_coAuthorship_isCoAuthorOf", new RelationInverse()
				.setRelation("isCoAuthorOf")
				.setInverse("isCoAuthorOf")
				.setRelType("personPerson")
				.setSubReltype("coAuthorship"));
	relationInverseMap
		.put(
			"personPerson_dedup_merges", new RelationInverse()
				.setRelation("merges")
				.setInverse("isMergedIn")
				.setRelType("personPerson")
				.setSubReltype("dedup"));
	relationInverseMap
		.put(
			"personPerson_dedup_isMergedIn", new RelationInverse()
				.setRelation("isMergedIn")
				.setInverse("merges")
				.setRelType("personPerson")
				.setSubReltype("dedup"));
	relationInverseMap
		.put(
			"personPerson_dedupSimilarity_isSimilarTo", new RelationInverse()
				.setRelation("isSimilarTo")
				.setInverse("isSimilarTo")
				.setRelType("personPerson")
				.setSubReltype("dedupSimilarity"));

	// --- datasourceOrganization ---
	relationInverseMap
		.put(
			"datasourceOrganization_provision_isProvidedBy", new RelationInverse()
				.setRelation("isProvidedBy")
				.setInverse("provides")
				.setRelType("datasourceOrganization")
				.setSubReltype("provision"));
	relationInverseMap
		.put(
			"datasourceOrganization_provision_provides", new RelationInverse()
				.setRelation("provides")
				.setInverse("isProvidedBy")
				.setRelType("datasourceOrganization")
				.setSubReltype("provision"));

	// --- resultResult ---
	relationInverseMap
		.put(
			"resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse()
				.setRelation("hasAmongTopNSimilarDocuments")
				.setInverse("isAmongTopNSimilarDocuments")
				.setRelType("resultResult")
				.setSubReltype("similarity"));
	relationInverseMap
		.put(
			"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
				.setRelation("isAmongTopNSimilarDocuments")
				.setInverse("hasAmongTopNSimilarDocuments")
				.setRelType("resultResult")
				.setSubReltype("similarity"));
	relationInverseMap
		.put(
			"resultResult_relationship_isRelatedTo", new RelationInverse()
				.setRelation("isRelatedTo")
				.setInverse("isRelatedTo")
				.setRelType("resultResult")
				.setSubReltype("relationship"));
	relationInverseMap
		.put(
			"resultResult_supplement_isSupplementTo", new RelationInverse()
				.setRelation("isSupplementTo")
				.setInverse("isSupplementedBy")
				.setRelType("resultResult")
				.setSubReltype("supplement"));
	relationInverseMap
		.put(
			"resultResult_supplement_isSupplementedBy", new RelationInverse()
				.setRelation("isSupplementedBy")
				.setInverse("isSupplementTo")
				.setRelType("resultResult")
				.setSubReltype("supplement"));
	relationInverseMap
		.put(
			"resultResult_part_isPartOf", new RelationInverse()
				.setRelation("isPartOf")
				.setInverse("hasPart")
				.setRelType("resultResult")
				.setSubReltype("part"));
	relationInverseMap
		.put(
			"resultResult_part_hasPart", new RelationInverse()
				.setRelation("hasPart")
				.setInverse("isPartOf")
				.setRelType("resultResult")
				.setSubReltype("part"));
	relationInverseMap
		.put(
			"resultResult_dedup_merges", new RelationInverse()
				.setRelation("merges")
				.setInverse("isMergedIn")
				.setRelType("resultResult")
				.setSubReltype("dedup"));
	relationInverseMap
		.put(
			"resultResult_dedup_isMergedIn", new RelationInverse()
				.setRelation("isMergedIn")
				.setInverse("merges")
				.setRelType("resultResult")
				.setSubReltype("dedup"));
	relationInverseMap
		.put(
			"resultResult_dedupSimilarity_isSimilarTo", new RelationInverse()
				.setRelation("isSimilarTo")
				.setInverse("isSimilarTo")
				.setRelType("resultResult")
				.setSubReltype("dedupSimilarity"));
}
private static final String schemeTemplate = "dnet:%s_%s_relations"; private static final String schemeTemplate = "dnet:%s_%s_relations";
private ModelSupport() { private ModelSupport() {
@ -202,5 +427,4 @@ public class ModelSupport {
private static <T extends Oaf> String idFnForOafEntity(T t) { private static <T extends Oaf> String idFnForOafEntity(T t) {
return ((OafEntity) t).getId(); return ((OafEntity) t).getId();
} }
} }

View File

@ -66,19 +66,25 @@ public class PrepareMergedRelationJob {
private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) { private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {
Dataset<Relation> relation = readRelations(spark, inputPath); Dataset<Relation> relation = readRelations(spark, inputPath);
relation.createOrReplaceTempView("relation");
spark relation.filter("relclass = 'merges' and datainfo.deletedbyinference=false")
.sql(
"Select * from relation " +
"where relclass = 'merges' " +
"and datainfo.deletedbyinference = false")
.as(Encoders.bean(Relation.class))
.toJSON()
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.text(outputPath); .json(outputPath);
// relation.createOrReplaceTempView("relation");
//
// spark
// .sql(
// "Select * from relation " +
// "where relclass = 'merges' " +
// "and datainfo.deletedbyinference = false")
// .as(Encoders.bean(Relation.class))
// .toJSON()
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .text(outputPath);
} }
public static org.apache.spark.sql.Dataset<Relation> readRelations( public static org.apache.spark.sql.Dataset<Relation> readRelations(

View File

@ -53,7 +53,7 @@ public class ReadBlacklistFromDB implements Closeable {
final String dbUrl = parser.get("postgresUrl"); final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser"); final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword"); final String dbPassword = parser.get("postgresPassword");
final String hdfsPath = parser.get("hdfsPath"); final String hdfsPath = parser.get("hdfsPath") + "/blacklist";
final String hdfsNameNode = parser.get("hdfsNameNode"); final String hdfsNameNode = parser.get("hdfsNameNode");
try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser, try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,

View File

@ -72,7 +72,7 @@ public class SparkRemoveBlacklistedRelationJob {
private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath, private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
String outputPath, String mergesPath) { String outputPath, String mergesPath) {
Dataset<Relation> blackListed = readRelations(spark, blacklistPath); Dataset<Relation> blackListed = readRelations(spark, blacklistPath + "/blacklist");
Dataset<Relation> inputRelation = readRelations(spark, inputPath); Dataset<Relation> inputRelation = readRelations(spark, inputPath);
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath); Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);