forked from D-Net/dnet-hadoop
This commit is contained in:
parent
4c94231cad
commit
28556507e7
|
@ -1,4 +1,3 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.common;
|
||||
|
||||
import java.util.Map;
|
||||
|
@ -13,7 +12,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
|
|||
public class ModelSupport {
|
||||
|
||||
/** Defines the mapping between the actual entity type and the main entity type */
|
||||
private static final Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
|
||||
private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
entityMapping.put(EntityType.publication, MainEntityType.result);
|
||||
|
@ -53,6 +52,232 @@ public class ModelSupport {
|
|||
oafTypes.put("relation", Relation.class);
|
||||
}
|
||||
|
||||
public static final Map<String, String> entityIdPrefix = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
entityIdPrefix.put("datasource", "10");
|
||||
entityIdPrefix.put("organization", "20");
|
||||
entityIdPrefix.put("project", "40");
|
||||
entityIdPrefix.put("result", "50");
|
||||
}
|
||||
|
||||
public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personResult_authorship_isAuthorOf", new RelationInverse()
|
||||
.setRelation("isAuthorOf")
|
||||
.setInverse("hasAuthor")
|
||||
.setRelType("personResult")
|
||||
.setSubReltype("authorship"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personResult_authorship_hasAuthor", new RelationInverse()
|
||||
.setInverse("isAuthorOf")
|
||||
.setRelation("hasAuthor")
|
||||
.setRelType("personResult")
|
||||
.setSubReltype("authorship"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"projectOrganization_participation_isParticipant", new RelationInverse()
|
||||
.setRelation("isParticipant")
|
||||
.setInverse("hasParticipant")
|
||||
.setRelType("projectOrganization")
|
||||
.setSubReltype("participation"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"projectOrganization_participation_hasParticipant", new RelationInverse()
|
||||
.setInverse("isParticipant")
|
||||
.setRelation("hasParticipant")
|
||||
.setRelType("projectOrganization")
|
||||
.setSubReltype("participation"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse()
|
||||
.setRelation("hasAuthorInstitution")
|
||||
.setInverse("isAuthorInstitutionOf")
|
||||
.setRelType("resultOrganization")
|
||||
.setSubReltype("affiliation"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse()
|
||||
.setInverse("hasAuthorInstitution")
|
||||
.setRelation("isAuthorInstitutionOf")
|
||||
.setRelType("resultOrganization")
|
||||
.setSubReltype("affiliation"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"organizationOrganization_dedup_merges", new RelationInverse()
|
||||
.setRelation("merges")
|
||||
.setInverse("isMergedIn")
|
||||
.setRelType("organizationOrganization")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"organizationOrganization_dedup_isMergedIn", new RelationInverse()
|
||||
.setInverse("merges")
|
||||
.setRelation("isMergedIn")
|
||||
.setRelType("organizationOrganization")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse()
|
||||
.setInverse("isSimilarTo")
|
||||
.setRelation("isSimilarTo")
|
||||
.setRelType("organizationOrganization")
|
||||
.setSubReltype("dedupSimilarity"));
|
||||
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultProject_outcome_isProducedBy", new RelationInverse()
|
||||
.setRelation("isProducedBy")
|
||||
.setInverse("produces")
|
||||
.setRelType("resultProject")
|
||||
.setSubReltype("outcome"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultProject_outcome_produces", new RelationInverse()
|
||||
.setInverse("isProducedBy")
|
||||
.setRelation("produces")
|
||||
.setRelType("resultProject")
|
||||
.setSubReltype("outcome"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"projectPerson_contactPerson_isContact", new RelationInverse()
|
||||
.setRelation("isContact")
|
||||
.setInverse("hasContact")
|
||||
.setRelType("projectPerson")
|
||||
.setSubReltype("contactPerson"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"projectPerson_contactPerson_hasContact", new RelationInverse()
|
||||
.setInverse("isContact")
|
||||
.setRelation("hasContact")
|
||||
.setRelType("personPerson")
|
||||
.setSubReltype("coAuthorship"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personPerson_coAuthorship_isCoauthorOf", new RelationInverse()
|
||||
.setInverse("isCoAuthorOf")
|
||||
.setRelation("isCoAuthorOf")
|
||||
.setRelType("personPerson")
|
||||
.setSubReltype("coAuthorship"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personPerson_dedup_merges", new RelationInverse()
|
||||
.setInverse("isMergedIn")
|
||||
.setRelation("merges")
|
||||
.setRelType("personPerson")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personPerson_dedup_isMergedIn", new RelationInverse()
|
||||
.setInverse("merges")
|
||||
.setRelation("isMergedIn")
|
||||
.setRelType("personPerson")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personPerson_dedupSimilarity_isSimilarTo", new RelationInverse()
|
||||
.setInverse("isSimilarTo")
|
||||
.setRelation("isSimilarTo")
|
||||
.setRelType("personPerson")
|
||||
.setSubReltype("dedupSimilarity"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"datasourceOrganization_provision_isProvidedBy", new RelationInverse()
|
||||
.setInverse("provides")
|
||||
.setRelation("isProvidedBy")
|
||||
.setRelType("datasourceOrganization")
|
||||
.setSubReltype("provision"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"datasourceOrganization_provision_provides", new RelationInverse()
|
||||
.setInverse("isProvidedBy")
|
||||
.setRelation("provides")
|
||||
.setRelType("datasourceOrganization")
|
||||
.setSubReltype("provision"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse()
|
||||
.setInverse("isAmongTopNSimilarDocuments")
|
||||
.setRelation("hasAmongTopNSimilarDocuments")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("similarity"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
|
||||
.setInverse("hasAmongTopNSimilarDocuments")
|
||||
.setRelation("isAmongTopNSimilarDocuments")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("similarity"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_relationship_isRelatedTo", new RelationInverse()
|
||||
.setInverse("isRelatedTo")
|
||||
.setRelation("isRelatedTo")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("relationship"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
|
||||
.setInverse("hasAmongTopNSimilarDocuments")
|
||||
.setRelation("isAmongTopNSimilarDocuments")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("similarity"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_supplement_isSupplementTo", new RelationInverse()
|
||||
.setInverse("isSupplementedBy")
|
||||
.setRelation("isSupplementTo")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("supplement"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_supplement_isSupplementedBy", new RelationInverse()
|
||||
.setInverse("isSupplementTo")
|
||||
.setRelation("isSupplementedBy")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("supplement"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_part_isPartOf", new RelationInverse()
|
||||
.setInverse("hasPart")
|
||||
.setRelation("isPartOf")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("part"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_part_hasPart", new RelationInverse()
|
||||
.setInverse("isPartOf")
|
||||
.setRelation("hasPart")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("part"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_dedup_merges", new RelationInverse()
|
||||
.setInverse("isMergedIn")
|
||||
.setRelation("merges")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_dedup_isMergedIn", new RelationInverse()
|
||||
.setInverse("merges")
|
||||
.setRelation("isMergedIn")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_dedupSimilarity_isSimilarTo", new RelationInverse()
|
||||
.setInverse("isSimilarTo")
|
||||
.setRelation("isSimilarTo")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("dedupSimilarity"));
|
||||
|
||||
}
|
||||
|
||||
private static final String schemeTemplate = "dnet:%s_%s_relations";
|
||||
|
||||
private ModelSupport() {
|
||||
|
@ -202,5 +427,4 @@ public class ModelSupport {
|
|||
private static <T extends Oaf> String idFnForOafEntity(T t) {
|
||||
return ((OafEntity) t).getId();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -66,19 +66,25 @@ public class PrepareMergedRelationJob {
|
|||
private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {
|
||||
|
||||
Dataset<Relation> relation = readRelations(spark, inputPath);
|
||||
relation.createOrReplaceTempView("relation");
|
||||
|
||||
spark
|
||||
.sql(
|
||||
"Select * from relation " +
|
||||
"where relclass = 'merges' " +
|
||||
"and datainfo.deletedbyinference = false")
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.toJSON()
|
||||
relation.filter("relclass = 'merges' and datainfo.deletedbyinference=false")
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(outputPath);
|
||||
.option("compression","gizp")
|
||||
.json(outputPath);
|
||||
// relation.createOrReplaceTempView("relation");
|
||||
//
|
||||
// spark
|
||||
// .sql(
|
||||
// "Select * from relation " +
|
||||
// "where relclass = 'merges' " +
|
||||
// "and datainfo.deletedbyinference = false")
|
||||
// .as(Encoders.bean(Relation.class))
|
||||
// .toJSON()
|
||||
// .write()
|
||||
// .mode(SaveMode.Overwrite)
|
||||
// .option("compression", "gzip")
|
||||
// .text(outputPath);
|
||||
}
|
||||
|
||||
public static org.apache.spark.sql.Dataset<Relation> readRelations(
|
||||
|
|
|
@ -53,7 +53,7 @@ public class ReadBlacklistFromDB implements Closeable {
|
|||
final String dbUrl = parser.get("postgresUrl");
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
final String hdfsPath = parser.get("hdfsPath") + "/blacklist";
|
||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||
|
||||
try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
|
||||
|
|
|
@ -72,7 +72,7 @@ public class SparkRemoveBlacklistedRelationJob {
|
|||
|
||||
private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
|
||||
String outputPath, String mergesPath) {
|
||||
Dataset<Relation> blackListed = readRelations(spark, blacklistPath);
|
||||
Dataset<Relation> blackListed = readRelations(spark, blacklistPath + "/blacklist");
|
||||
Dataset<Relation> inputRelation = readRelations(spark, inputPath);
|
||||
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
|
||||
|
||||
|
|
Loading…
Reference in New Issue