forked from antonis.lempesis/dnet-hadoop
This commit is contained in:
parent
4c94231cad
commit
28556507e7
|
@ -1,4 +1,3 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.common;
|
||||
|
||||
import java.util.Map;
|
||||
|
@ -13,7 +12,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
|
|||
public class ModelSupport {
|
||||
|
||||
/** Defines the mapping between the actual entity type and the main entity type */
|
||||
private static final Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
|
||||
private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
entityMapping.put(EntityType.publication, MainEntityType.result);
|
||||
|
@ -53,6 +52,232 @@ public class ModelSupport {
|
|||
oafTypes.put("relation", Relation.class);
|
||||
}
|
||||
|
||||
public static final Map<String, String> entityIdPrefix = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
entityIdPrefix.put("datasource", "10");
|
||||
entityIdPrefix.put("organization", "20");
|
||||
entityIdPrefix.put("project", "40");
|
||||
entityIdPrefix.put("result", "50");
|
||||
}
|
||||
|
||||
public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personResult_authorship_isAuthorOf", new RelationInverse()
|
||||
.setRelation("isAuthorOf")
|
||||
.setInverse("hasAuthor")
|
||||
.setRelType("personResult")
|
||||
.setSubReltype("authorship"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personResult_authorship_hasAuthor", new RelationInverse()
|
||||
.setInverse("isAuthorOf")
|
||||
.setRelation("hasAuthor")
|
||||
.setRelType("personResult")
|
||||
.setSubReltype("authorship"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"projectOrganization_participation_isParticipant", new RelationInverse()
|
||||
.setRelation("isParticipant")
|
||||
.setInverse("hasParticipant")
|
||||
.setRelType("projectOrganization")
|
||||
.setSubReltype("participation"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"projectOrganization_participation_hasParticipant", new RelationInverse()
|
||||
.setInverse("isParticipant")
|
||||
.setRelation("hasParticipant")
|
||||
.setRelType("projectOrganization")
|
||||
.setSubReltype("participation"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse()
|
||||
.setRelation("hasAuthorInstitution")
|
||||
.setInverse("isAuthorInstitutionOf")
|
||||
.setRelType("resultOrganization")
|
||||
.setSubReltype("affiliation"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse()
|
||||
.setInverse("hasAuthorInstitution")
|
||||
.setRelation("isAuthorInstitutionOf")
|
||||
.setRelType("resultOrganization")
|
||||
.setSubReltype("affiliation"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"organizationOrganization_dedup_merges", new RelationInverse()
|
||||
.setRelation("merges")
|
||||
.setInverse("isMergedIn")
|
||||
.setRelType("organizationOrganization")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"organizationOrganization_dedup_isMergedIn", new RelationInverse()
|
||||
.setInverse("merges")
|
||||
.setRelation("isMergedIn")
|
||||
.setRelType("organizationOrganization")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse()
|
||||
.setInverse("isSimilarTo")
|
||||
.setRelation("isSimilarTo")
|
||||
.setRelType("organizationOrganization")
|
||||
.setSubReltype("dedupSimilarity"));
|
||||
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultProject_outcome_isProducedBy", new RelationInverse()
|
||||
.setRelation("isProducedBy")
|
||||
.setInverse("produces")
|
||||
.setRelType("resultProject")
|
||||
.setSubReltype("outcome"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultProject_outcome_produces", new RelationInverse()
|
||||
.setInverse("isProducedBy")
|
||||
.setRelation("produces")
|
||||
.setRelType("resultProject")
|
||||
.setSubReltype("outcome"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"projectPerson_contactPerson_isContact", new RelationInverse()
|
||||
.setRelation("isContact")
|
||||
.setInverse("hasContact")
|
||||
.setRelType("projectPerson")
|
||||
.setSubReltype("contactPerson"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"projectPerson_contactPerson_hasContact", new RelationInverse()
|
||||
.setInverse("isContact")
|
||||
.setRelation("hasContact")
|
||||
.setRelType("personPerson")
|
||||
.setSubReltype("coAuthorship"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personPerson_coAuthorship_isCoauthorOf", new RelationInverse()
|
||||
.setInverse("isCoAuthorOf")
|
||||
.setRelation("isCoAuthorOf")
|
||||
.setRelType("personPerson")
|
||||
.setSubReltype("coAuthorship"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personPerson_dedup_merges", new RelationInverse()
|
||||
.setInverse("isMergedIn")
|
||||
.setRelation("merges")
|
||||
.setRelType("personPerson")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personPerson_dedup_isMergedIn", new RelationInverse()
|
||||
.setInverse("merges")
|
||||
.setRelation("isMergedIn")
|
||||
.setRelType("personPerson")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"personPerson_dedupSimilarity_isSimilarTo", new RelationInverse()
|
||||
.setInverse("isSimilarTo")
|
||||
.setRelation("isSimilarTo")
|
||||
.setRelType("personPerson")
|
||||
.setSubReltype("dedupSimilarity"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"datasourceOrganization_provision_isProvidedBy", new RelationInverse()
|
||||
.setInverse("provides")
|
||||
.setRelation("isProvidedBy")
|
||||
.setRelType("datasourceOrganization")
|
||||
.setSubReltype("provision"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"datasourceOrganization_provision_provides", new RelationInverse()
|
||||
.setInverse("isProvidedBy")
|
||||
.setRelation("provides")
|
||||
.setRelType("datasourceOrganization")
|
||||
.setSubReltype("provision"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse()
|
||||
.setInverse("isAmongTopNSimilarDocuments")
|
||||
.setRelation("hasAmongTopNSimilarDocuments")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("similarity"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
|
||||
.setInverse("hasAmongTopNSimilarDocuments")
|
||||
.setRelation("isAmongTopNSimilarDocuments")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("similarity"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_relationship_isRelatedTo", new RelationInverse()
|
||||
.setInverse("isRelatedTo")
|
||||
.setRelation("isRelatedTo")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("relationship"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
|
||||
.setInverse("hasAmongTopNSimilarDocuments")
|
||||
.setRelation("isAmongTopNSimilarDocuments")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("similarity"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_supplement_isSupplementTo", new RelationInverse()
|
||||
.setInverse("isSupplementedBy")
|
||||
.setRelation("isSupplementTo")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("supplement"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_supplement_isSupplementedBy", new RelationInverse()
|
||||
.setInverse("isSupplementTo")
|
||||
.setRelation("isSupplementedBy")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("supplement"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_part_isPartOf", new RelationInverse()
|
||||
.setInverse("hasPart")
|
||||
.setRelation("isPartOf")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("part"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_part_hasPart", new RelationInverse()
|
||||
.setInverse("isPartOf")
|
||||
.setRelation("hasPart")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("part"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_dedup_merges", new RelationInverse()
|
||||
.setInverse("isMergedIn")
|
||||
.setRelation("merges")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_dedup_isMergedIn", new RelationInverse()
|
||||
.setInverse("merges")
|
||||
.setRelation("isMergedIn")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("dedup"));
|
||||
relationInverseMap
|
||||
.put(
|
||||
"resultResult_dedupSimilarity_isSimilarTo", new RelationInverse()
|
||||
.setInverse("isSimilarTo")
|
||||
.setRelation("isSimilarTo")
|
||||
.setRelType("resultResult")
|
||||
.setSubReltype("dedupSimilarity"));
|
||||
|
||||
}
|
||||
|
||||
private static final String schemeTemplate = "dnet:%s_%s_relations";
|
||||
|
||||
private ModelSupport() {
|
||||
|
@ -68,7 +293,7 @@ public class ModelSupport {
|
|||
* @return True if X is a subclass of Y
|
||||
*/
|
||||
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
|
||||
X subClazzObject, Y superClazzObject) {
|
||||
X subClazzObject, Y superClazzObject) {
|
||||
return isSubClass(subClazzObject.getClass(), superClazzObject.getClass());
|
||||
}
|
||||
|
||||
|
@ -82,7 +307,7 @@ public class ModelSupport {
|
|||
* @return True if X is a subclass of Y
|
||||
*/
|
||||
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
|
||||
X subClazzObject, Class<Y> superClazz) {
|
||||
X subClazzObject, Class<Y> superClazz) {
|
||||
return isSubClass(subClazzObject.getClass(), superClazz);
|
||||
}
|
||||
|
||||
|
@ -96,7 +321,7 @@ public class ModelSupport {
|
|||
* @return True if X is a subclass of Y
|
||||
*/
|
||||
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
|
||||
Class<X> subClazz, Class<Y> superClazz) {
|
||||
Class<X> subClazz, Class<Y> superClazz) {
|
||||
return superClazz.isAssignableFrom(subClazz);
|
||||
}
|
||||
|
||||
|
@ -108,32 +333,32 @@ public class ModelSupport {
|
|||
*/
|
||||
public static <T extends Oaf> Class<T>[] getOafModelClasses() {
|
||||
return new Class[] {
|
||||
Author.class,
|
||||
Context.class,
|
||||
Country.class,
|
||||
DataInfo.class,
|
||||
Dataset.class,
|
||||
Datasource.class,
|
||||
ExternalReference.class,
|
||||
ExtraInfo.class,
|
||||
Field.class,
|
||||
GeoLocation.class,
|
||||
Instance.class,
|
||||
Journal.class,
|
||||
KeyValue.class,
|
||||
Oaf.class,
|
||||
OafEntity.class,
|
||||
OAIProvenance.class,
|
||||
Organization.class,
|
||||
OriginDescription.class,
|
||||
OtherResearchProduct.class,
|
||||
Project.class,
|
||||
Publication.class,
|
||||
Qualifier.class,
|
||||
Relation.class,
|
||||
Result.class,
|
||||
Software.class,
|
||||
StructuredProperty.class
|
||||
Author.class,
|
||||
Context.class,
|
||||
Country.class,
|
||||
DataInfo.class,
|
||||
Dataset.class,
|
||||
Datasource.class,
|
||||
ExternalReference.class,
|
||||
ExtraInfo.class,
|
||||
Field.class,
|
||||
GeoLocation.class,
|
||||
Instance.class,
|
||||
Journal.class,
|
||||
KeyValue.class,
|
||||
Oaf.class,
|
||||
OafEntity.class,
|
||||
OAIProvenance.class,
|
||||
Organization.class,
|
||||
OriginDescription.class,
|
||||
OtherResearchProduct.class,
|
||||
Project.class,
|
||||
Publication.class,
|
||||
Qualifier.class,
|
||||
Relation.class,
|
||||
Result.class,
|
||||
Software.class,
|
||||
StructuredProperty.class
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -147,10 +372,10 @@ public class ModelSupport {
|
|||
|
||||
public static String getScheme(final String sourceType, final String targetType) {
|
||||
return String
|
||||
.format(
|
||||
schemeTemplate,
|
||||
entityMapping.get(EntityType.valueOf(sourceType)).name(),
|
||||
entityMapping.get(EntityType.valueOf(targetType)).name());
|
||||
.format(
|
||||
schemeTemplate,
|
||||
entityMapping.get(EntityType.valueOf(sourceType)).name(),
|
||||
entityMapping.get(EntityType.valueOf(targetType)).name());
|
||||
}
|
||||
|
||||
public static <T extends Oaf> Function<T, String> idFn() {
|
||||
|
@ -165,42 +390,41 @@ public class ModelSupport {
|
|||
private static <T extends Oaf> String idFnForRelation(T t) {
|
||||
Relation r = (Relation) t;
|
||||
return Optional
|
||||
.ofNullable(r.getSource())
|
||||
.map(
|
||||
source -> Optional
|
||||
.ofNullable(r.getTarget())
|
||||
.map(
|
||||
target -> Optional
|
||||
.ofNullable(r.getRelType())
|
||||
.map(
|
||||
relType -> Optional
|
||||
.ofNullable(r.getSubRelType())
|
||||
.map(
|
||||
subRelType -> Optional
|
||||
.ofNullable(r.getRelClass())
|
||||
.map(
|
||||
relClass -> String
|
||||
.join(
|
||||
source,
|
||||
target,
|
||||
relType,
|
||||
subRelType,
|
||||
relClass))
|
||||
.orElse(
|
||||
String
|
||||
.join(
|
||||
source,
|
||||
target,
|
||||
relType,
|
||||
subRelType)))
|
||||
.orElse(String.join(source, target, relType)))
|
||||
.orElse(String.join(source, target)))
|
||||
.orElse(source))
|
||||
.orElse(null);
|
||||
.ofNullable(r.getSource())
|
||||
.map(
|
||||
source -> Optional
|
||||
.ofNullable(r.getTarget())
|
||||
.map(
|
||||
target -> Optional
|
||||
.ofNullable(r.getRelType())
|
||||
.map(
|
||||
relType -> Optional
|
||||
.ofNullable(r.getSubRelType())
|
||||
.map(
|
||||
subRelType -> Optional
|
||||
.ofNullable(r.getRelClass())
|
||||
.map(
|
||||
relClass -> String
|
||||
.join(
|
||||
source,
|
||||
target,
|
||||
relType,
|
||||
subRelType,
|
||||
relClass))
|
||||
.orElse(
|
||||
String
|
||||
.join(
|
||||
source,
|
||||
target,
|
||||
relType,
|
||||
subRelType)))
|
||||
.orElse(String.join(source, target, relType)))
|
||||
.orElse(String.join(source, target)))
|
||||
.orElse(source))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
private static <T extends Oaf> String idFnForOafEntity(T t) {
|
||||
return ((OafEntity) t).getId();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -66,19 +66,25 @@ public class PrepareMergedRelationJob {
|
|||
private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {
|
||||
|
||||
Dataset<Relation> relation = readRelations(spark, inputPath);
|
||||
relation.createOrReplaceTempView("relation");
|
||||
|
||||
spark
|
||||
.sql(
|
||||
"Select * from relation " +
|
||||
"where relclass = 'merges' " +
|
||||
"and datainfo.deletedbyinference = false")
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.toJSON()
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(outputPath);
|
||||
relation.filter("relclass = 'merges' and datainfo.deletedbyinference=false")
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression","gizp")
|
||||
.json(outputPath);
|
||||
// relation.createOrReplaceTempView("relation");
|
||||
//
|
||||
// spark
|
||||
// .sql(
|
||||
// "Select * from relation " +
|
||||
// "where relclass = 'merges' " +
|
||||
// "and datainfo.deletedbyinference = false")
|
||||
// .as(Encoders.bean(Relation.class))
|
||||
// .toJSON()
|
||||
// .write()
|
||||
// .mode(SaveMode.Overwrite)
|
||||
// .option("compression", "gzip")
|
||||
// .text(outputPath);
|
||||
}
|
||||
|
||||
public static org.apache.spark.sql.Dataset<Relation> readRelations(
|
||||
|
|
|
@ -53,7 +53,7 @@ public class ReadBlacklistFromDB implements Closeable {
|
|||
final String dbUrl = parser.get("postgresUrl");
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
final String hdfsPath = parser.get("hdfsPath") + "/blacklist";
|
||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||
|
||||
try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
|
||||
|
|
|
@ -72,7 +72,7 @@ public class SparkRemoveBlacklistedRelationJob {
|
|||
|
||||
private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
|
||||
String outputPath, String mergesPath) {
|
||||
Dataset<Relation> blackListed = readRelations(spark, blacklistPath);
|
||||
Dataset<Relation> blackListed = readRelations(spark, blacklistPath + "/blacklist");
|
||||
Dataset<Relation> inputRelation = readRelations(spark, inputPath);
|
||||
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
|
||||
|
||||
|
|
Loading…
Reference in New Issue