forked from D-Net/dnet-hadoop
This commit is contained in:
parent
4c94231cad
commit
28556507e7
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.common;
|
package eu.dnetlib.dhp.schema.common;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -13,7 +12,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
public class ModelSupport {
|
public class ModelSupport {
|
||||||
|
|
||||||
/** Defines the mapping between the actual entity type and the main entity type */
|
/** Defines the mapping between the actual entity type and the main entity type */
|
||||||
private static final Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
|
private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
|
||||||
|
|
||||||
static {
|
static {
|
||||||
entityMapping.put(EntityType.publication, MainEntityType.result);
|
entityMapping.put(EntityType.publication, MainEntityType.result);
|
||||||
|
@ -53,6 +52,232 @@ public class ModelSupport {
|
||||||
oafTypes.put("relation", Relation.class);
|
oafTypes.put("relation", Relation.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static final Map<String, String> entityIdPrefix = Maps.newHashMap();
|
||||||
|
|
||||||
|
static {
|
||||||
|
entityIdPrefix.put("datasource", "10");
|
||||||
|
entityIdPrefix.put("organization", "20");
|
||||||
|
entityIdPrefix.put("project", "40");
|
||||||
|
entityIdPrefix.put("result", "50");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final Map<String, RelationInverse> relationInverseMap = Maps.newHashMap();
|
||||||
|
|
||||||
|
static {
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"personResult_authorship_isAuthorOf", new RelationInverse()
|
||||||
|
.setRelation("isAuthorOf")
|
||||||
|
.setInverse("hasAuthor")
|
||||||
|
.setRelType("personResult")
|
||||||
|
.setSubReltype("authorship"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"personResult_authorship_hasAuthor", new RelationInverse()
|
||||||
|
.setInverse("isAuthorOf")
|
||||||
|
.setRelation("hasAuthor")
|
||||||
|
.setRelType("personResult")
|
||||||
|
.setSubReltype("authorship"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"projectOrganization_participation_isParticipant", new RelationInverse()
|
||||||
|
.setRelation("isParticipant")
|
||||||
|
.setInverse("hasParticipant")
|
||||||
|
.setRelType("projectOrganization")
|
||||||
|
.setSubReltype("participation"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"projectOrganization_participation_hasParticipant", new RelationInverse()
|
||||||
|
.setInverse("isParticipant")
|
||||||
|
.setRelation("hasParticipant")
|
||||||
|
.setRelType("projectOrganization")
|
||||||
|
.setSubReltype("participation"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse()
|
||||||
|
.setRelation("hasAuthorInstitution")
|
||||||
|
.setInverse("isAuthorInstitutionOf")
|
||||||
|
.setRelType("resultOrganization")
|
||||||
|
.setSubReltype("affiliation"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse()
|
||||||
|
.setInverse("hasAuthorInstitution")
|
||||||
|
.setRelation("isAuthorInstitutionOf")
|
||||||
|
.setRelType("resultOrganization")
|
||||||
|
.setSubReltype("affiliation"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"organizationOrganization_dedup_merges", new RelationInverse()
|
||||||
|
.setRelation("merges")
|
||||||
|
.setInverse("isMergedIn")
|
||||||
|
.setRelType("organizationOrganization")
|
||||||
|
.setSubReltype("dedup"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"organizationOrganization_dedup_isMergedIn", new RelationInverse()
|
||||||
|
.setInverse("merges")
|
||||||
|
.setRelation("isMergedIn")
|
||||||
|
.setRelType("organizationOrganization")
|
||||||
|
.setSubReltype("dedup"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse()
|
||||||
|
.setInverse("isSimilarTo")
|
||||||
|
.setRelation("isSimilarTo")
|
||||||
|
.setRelType("organizationOrganization")
|
||||||
|
.setSubReltype("dedupSimilarity"));
|
||||||
|
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultProject_outcome_isProducedBy", new RelationInverse()
|
||||||
|
.setRelation("isProducedBy")
|
||||||
|
.setInverse("produces")
|
||||||
|
.setRelType("resultProject")
|
||||||
|
.setSubReltype("outcome"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultProject_outcome_produces", new RelationInverse()
|
||||||
|
.setInverse("isProducedBy")
|
||||||
|
.setRelation("produces")
|
||||||
|
.setRelType("resultProject")
|
||||||
|
.setSubReltype("outcome"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"projectPerson_contactPerson_isContact", new RelationInverse()
|
||||||
|
.setRelation("isContact")
|
||||||
|
.setInverse("hasContact")
|
||||||
|
.setRelType("projectPerson")
|
||||||
|
.setSubReltype("contactPerson"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"projectPerson_contactPerson_hasContact", new RelationInverse()
|
||||||
|
.setInverse("isContact")
|
||||||
|
.setRelation("hasContact")
|
||||||
|
.setRelType("personPerson")
|
||||||
|
.setSubReltype("coAuthorship"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"personPerson_coAuthorship_isCoauthorOf", new RelationInverse()
|
||||||
|
.setInverse("isCoAuthorOf")
|
||||||
|
.setRelation("isCoAuthorOf")
|
||||||
|
.setRelType("personPerson")
|
||||||
|
.setSubReltype("coAuthorship"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"personPerson_dedup_merges", new RelationInverse()
|
||||||
|
.setInverse("isMergedIn")
|
||||||
|
.setRelation("merges")
|
||||||
|
.setRelType("personPerson")
|
||||||
|
.setSubReltype("dedup"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"personPerson_dedup_isMergedIn", new RelationInverse()
|
||||||
|
.setInverse("merges")
|
||||||
|
.setRelation("isMergedIn")
|
||||||
|
.setRelType("personPerson")
|
||||||
|
.setSubReltype("dedup"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"personPerson_dedupSimilarity_isSimilarTo", new RelationInverse()
|
||||||
|
.setInverse("isSimilarTo")
|
||||||
|
.setRelation("isSimilarTo")
|
||||||
|
.setRelType("personPerson")
|
||||||
|
.setSubReltype("dedupSimilarity"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"datasourceOrganization_provision_isProvidedBy", new RelationInverse()
|
||||||
|
.setInverse("provides")
|
||||||
|
.setRelation("isProvidedBy")
|
||||||
|
.setRelType("datasourceOrganization")
|
||||||
|
.setSubReltype("provision"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"datasourceOrganization_provision_provides", new RelationInverse()
|
||||||
|
.setInverse("isProvidedBy")
|
||||||
|
.setRelation("provides")
|
||||||
|
.setRelType("datasourceOrganization")
|
||||||
|
.setSubReltype("provision"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse()
|
||||||
|
.setInverse("isAmongTopNSimilarDocuments")
|
||||||
|
.setRelation("hasAmongTopNSimilarDocuments")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("similarity"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
|
||||||
|
.setInverse("hasAmongTopNSimilarDocuments")
|
||||||
|
.setRelation("isAmongTopNSimilarDocuments")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("similarity"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_relationship_isRelatedTo", new RelationInverse()
|
||||||
|
.setInverse("isRelatedTo")
|
||||||
|
.setRelation("isRelatedTo")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("relationship"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse()
|
||||||
|
.setInverse("hasAmongTopNSimilarDocuments")
|
||||||
|
.setRelation("isAmongTopNSimilarDocuments")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("similarity"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_supplement_isSupplementTo", new RelationInverse()
|
||||||
|
.setInverse("isSupplementedBy")
|
||||||
|
.setRelation("isSupplementTo")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("supplement"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_supplement_isSupplementedBy", new RelationInverse()
|
||||||
|
.setInverse("isSupplementTo")
|
||||||
|
.setRelation("isSupplementedBy")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("supplement"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_part_isPartOf", new RelationInverse()
|
||||||
|
.setInverse("hasPart")
|
||||||
|
.setRelation("isPartOf")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("part"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_part_hasPart", new RelationInverse()
|
||||||
|
.setInverse("isPartOf")
|
||||||
|
.setRelation("hasPart")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("part"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_dedup_merges", new RelationInverse()
|
||||||
|
.setInverse("isMergedIn")
|
||||||
|
.setRelation("merges")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("dedup"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_dedup_isMergedIn", new RelationInverse()
|
||||||
|
.setInverse("merges")
|
||||||
|
.setRelation("isMergedIn")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("dedup"));
|
||||||
|
relationInverseMap
|
||||||
|
.put(
|
||||||
|
"resultResult_dedupSimilarity_isSimilarTo", new RelationInverse()
|
||||||
|
.setInverse("isSimilarTo")
|
||||||
|
.setRelation("isSimilarTo")
|
||||||
|
.setRelType("resultResult")
|
||||||
|
.setSubReltype("dedupSimilarity"));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
private static final String schemeTemplate = "dnet:%s_%s_relations";
|
private static final String schemeTemplate = "dnet:%s_%s_relations";
|
||||||
|
|
||||||
private ModelSupport() {
|
private ModelSupport() {
|
||||||
|
@ -68,7 +293,7 @@ public class ModelSupport {
|
||||||
* @return True if X is a subclass of Y
|
* @return True if X is a subclass of Y
|
||||||
*/
|
*/
|
||||||
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
|
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
|
||||||
X subClazzObject, Y superClazzObject) {
|
X subClazzObject, Y superClazzObject) {
|
||||||
return isSubClass(subClazzObject.getClass(), superClazzObject.getClass());
|
return isSubClass(subClazzObject.getClass(), superClazzObject.getClass());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -82,7 +307,7 @@ public class ModelSupport {
|
||||||
* @return True if X is a subclass of Y
|
* @return True if X is a subclass of Y
|
||||||
*/
|
*/
|
||||||
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
|
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
|
||||||
X subClazzObject, Class<Y> superClazz) {
|
X subClazzObject, Class<Y> superClazz) {
|
||||||
return isSubClass(subClazzObject.getClass(), superClazz);
|
return isSubClass(subClazzObject.getClass(), superClazz);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -96,7 +321,7 @@ public class ModelSupport {
|
||||||
* @return True if X is a subclass of Y
|
* @return True if X is a subclass of Y
|
||||||
*/
|
*/
|
||||||
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
|
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(
|
||||||
Class<X> subClazz, Class<Y> superClazz) {
|
Class<X> subClazz, Class<Y> superClazz) {
|
||||||
return superClazz.isAssignableFrom(subClazz);
|
return superClazz.isAssignableFrom(subClazz);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -108,32 +333,32 @@ public class ModelSupport {
|
||||||
*/
|
*/
|
||||||
public static <T extends Oaf> Class<T>[] getOafModelClasses() {
|
public static <T extends Oaf> Class<T>[] getOafModelClasses() {
|
||||||
return new Class[] {
|
return new Class[] {
|
||||||
Author.class,
|
Author.class,
|
||||||
Context.class,
|
Context.class,
|
||||||
Country.class,
|
Country.class,
|
||||||
DataInfo.class,
|
DataInfo.class,
|
||||||
Dataset.class,
|
Dataset.class,
|
||||||
Datasource.class,
|
Datasource.class,
|
||||||
ExternalReference.class,
|
ExternalReference.class,
|
||||||
ExtraInfo.class,
|
ExtraInfo.class,
|
||||||
Field.class,
|
Field.class,
|
||||||
GeoLocation.class,
|
GeoLocation.class,
|
||||||
Instance.class,
|
Instance.class,
|
||||||
Journal.class,
|
Journal.class,
|
||||||
KeyValue.class,
|
KeyValue.class,
|
||||||
Oaf.class,
|
Oaf.class,
|
||||||
OafEntity.class,
|
OafEntity.class,
|
||||||
OAIProvenance.class,
|
OAIProvenance.class,
|
||||||
Organization.class,
|
Organization.class,
|
||||||
OriginDescription.class,
|
OriginDescription.class,
|
||||||
OtherResearchProduct.class,
|
OtherResearchProduct.class,
|
||||||
Project.class,
|
Project.class,
|
||||||
Publication.class,
|
Publication.class,
|
||||||
Qualifier.class,
|
Qualifier.class,
|
||||||
Relation.class,
|
Relation.class,
|
||||||
Result.class,
|
Result.class,
|
||||||
Software.class,
|
Software.class,
|
||||||
StructuredProperty.class
|
StructuredProperty.class
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -147,10 +372,10 @@ public class ModelSupport {
|
||||||
|
|
||||||
public static String getScheme(final String sourceType, final String targetType) {
|
public static String getScheme(final String sourceType, final String targetType) {
|
||||||
return String
|
return String
|
||||||
.format(
|
.format(
|
||||||
schemeTemplate,
|
schemeTemplate,
|
||||||
entityMapping.get(EntityType.valueOf(sourceType)).name(),
|
entityMapping.get(EntityType.valueOf(sourceType)).name(),
|
||||||
entityMapping.get(EntityType.valueOf(targetType)).name());
|
entityMapping.get(EntityType.valueOf(targetType)).name());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> Function<T, String> idFn() {
|
public static <T extends Oaf> Function<T, String> idFn() {
|
||||||
|
@ -165,42 +390,41 @@ public class ModelSupport {
|
||||||
private static <T extends Oaf> String idFnForRelation(T t) {
|
private static <T extends Oaf> String idFnForRelation(T t) {
|
||||||
Relation r = (Relation) t;
|
Relation r = (Relation) t;
|
||||||
return Optional
|
return Optional
|
||||||
.ofNullable(r.getSource())
|
.ofNullable(r.getSource())
|
||||||
.map(
|
.map(
|
||||||
source -> Optional
|
source -> Optional
|
||||||
.ofNullable(r.getTarget())
|
.ofNullable(r.getTarget())
|
||||||
.map(
|
.map(
|
||||||
target -> Optional
|
target -> Optional
|
||||||
.ofNullable(r.getRelType())
|
.ofNullable(r.getRelType())
|
||||||
.map(
|
.map(
|
||||||
relType -> Optional
|
relType -> Optional
|
||||||
.ofNullable(r.getSubRelType())
|
.ofNullable(r.getSubRelType())
|
||||||
.map(
|
.map(
|
||||||
subRelType -> Optional
|
subRelType -> Optional
|
||||||
.ofNullable(r.getRelClass())
|
.ofNullable(r.getRelClass())
|
||||||
.map(
|
.map(
|
||||||
relClass -> String
|
relClass -> String
|
||||||
.join(
|
.join(
|
||||||
source,
|
source,
|
||||||
target,
|
target,
|
||||||
relType,
|
relType,
|
||||||
subRelType,
|
subRelType,
|
||||||
relClass))
|
relClass))
|
||||||
.orElse(
|
.orElse(
|
||||||
String
|
String
|
||||||
.join(
|
.join(
|
||||||
source,
|
source,
|
||||||
target,
|
target,
|
||||||
relType,
|
relType,
|
||||||
subRelType)))
|
subRelType)))
|
||||||
.orElse(String.join(source, target, relType)))
|
.orElse(String.join(source, target, relType)))
|
||||||
.orElse(String.join(source, target)))
|
.orElse(String.join(source, target)))
|
||||||
.orElse(source))
|
.orElse(source))
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <T extends Oaf> String idFnForOafEntity(T t) {
|
private static <T extends Oaf> String idFnForOafEntity(T t) {
|
||||||
return ((OafEntity) t).getId();
|
return ((OafEntity) t).getId();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -66,19 +66,25 @@ public class PrepareMergedRelationJob {
|
||||||
private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {
|
private static void selectMergesRelations(SparkSession spark, String inputPath, String outputPath) {
|
||||||
|
|
||||||
Dataset<Relation> relation = readRelations(spark, inputPath);
|
Dataset<Relation> relation = readRelations(spark, inputPath);
|
||||||
relation.createOrReplaceTempView("relation");
|
|
||||||
|
|
||||||
spark
|
relation.filter("relclass = 'merges' and datainfo.deletedbyinference=false")
|
||||||
.sql(
|
.write()
|
||||||
"Select * from relation " +
|
.mode(SaveMode.Overwrite)
|
||||||
"where relclass = 'merges' " +
|
.option("compression","gizp")
|
||||||
"and datainfo.deletedbyinference = false")
|
.json(outputPath);
|
||||||
.as(Encoders.bean(Relation.class))
|
// relation.createOrReplaceTempView("relation");
|
||||||
.toJSON()
|
//
|
||||||
.write()
|
// spark
|
||||||
.mode(SaveMode.Overwrite)
|
// .sql(
|
||||||
.option("compression", "gzip")
|
// "Select * from relation " +
|
||||||
.text(outputPath);
|
// "where relclass = 'merges' " +
|
||||||
|
// "and datainfo.deletedbyinference = false")
|
||||||
|
// .as(Encoders.bean(Relation.class))
|
||||||
|
// .toJSON()
|
||||||
|
// .write()
|
||||||
|
// .mode(SaveMode.Overwrite)
|
||||||
|
// .option("compression", "gzip")
|
||||||
|
// .text(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static org.apache.spark.sql.Dataset<Relation> readRelations(
|
public static org.apache.spark.sql.Dataset<Relation> readRelations(
|
||||||
|
|
|
@ -53,7 +53,7 @@ public class ReadBlacklistFromDB implements Closeable {
|
||||||
final String dbUrl = parser.get("postgresUrl");
|
final String dbUrl = parser.get("postgresUrl");
|
||||||
final String dbUser = parser.get("postgresUser");
|
final String dbUser = parser.get("postgresUser");
|
||||||
final String dbPassword = parser.get("postgresPassword");
|
final String dbPassword = parser.get("postgresPassword");
|
||||||
final String hdfsPath = parser.get("hdfsPath");
|
final String hdfsPath = parser.get("hdfsPath") + "/blacklist";
|
||||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||||
|
|
||||||
try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
|
try (final ReadBlacklistFromDB rbl = new ReadBlacklistFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
|
||||||
|
|
|
@ -72,7 +72,7 @@ public class SparkRemoveBlacklistedRelationJob {
|
||||||
|
|
||||||
private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
|
private static void removeBlacklistedRelations(SparkSession spark, String blacklistPath, String inputPath,
|
||||||
String outputPath, String mergesPath) {
|
String outputPath, String mergesPath) {
|
||||||
Dataset<Relation> blackListed = readRelations(spark, blacklistPath);
|
Dataset<Relation> blackListed = readRelations(spark, blacklistPath + "/blacklist");
|
||||||
Dataset<Relation> inputRelation = readRelations(spark, inputPath);
|
Dataset<Relation> inputRelation = readRelations(spark, inputPath);
|
||||||
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
|
Dataset<Relation> mergesRelation = readRelations(spark, mergesPath);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue