Fix import of affiliations relations from Crossref #335
|
@ -80,16 +80,15 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
// load and parse affiliation relations from HDFS
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
.schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`RORid`:ARRAY<STRING>,`Confidence`:DOUBLE>>")
|
||||
.schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
||||
.json(inputPath);
|
||||
|
||||
// unroll nested arrays
|
||||
df = df
|
||||
.withColumn("matching", functions.explode(new Column("Matchings")))
|
||||
.withColumn("rorid", functions.explode(new Column("matching.RORid")))
|
||||
.select(
|
||||
new Column("DOI").as("doi"),
|
||||
new Column("rorid"),
|
||||
new Column("matching.RORid").as("rorid"),
|
||||
new Column("matching.Confidence").as("confidence"));
|
||||
|
||||
// prepare action sets for affiliation relations
|
||||
|
|
|
@ -31,5 +31,5 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen
|
|||
# The following is needed as a property of a workflow
|
||||
oozie.wf.application.path=${oozieTopWfApplicationPath}
|
||||
|
||||
inputPath=/user/schatz/affiliations/data-v3.1.json
|
||||
outputPath=/tmp/crossref-affiliations-output-v3.1
|
||||
inputPath=/data/bip-affiliations/data.json
|
||||
outputPath=/tmp/crossref-affiliations-output-v5
|
||||
|
|
|
@ -101,7 +101,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
// );
|
||||
// }
|
||||
// count the number of relations
|
||||
assertEquals(16, tmp.count());
|
||||
assertEquals(20, tmp.count());
|
||||
|
||||
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||
dataset.createOrReplaceTempView("result");
|
||||
|
@ -112,7 +112,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
// verify that we have equal number of bi-directional relations
|
||||
Assertions
|
||||
.assertEquals(
|
||||
8, execVerification
|
||||
10, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
||||
.collectAsList()
|
||||
|
@ -120,14 +120,14 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
8, execVerification
|
||||
10, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
||||
.collectAsList()
|
||||
.size());
|
||||
|
||||
// check confidence value of a specific relation
|
||||
String sourceDOI = "10.1105/tpc.8.3.343";
|
||||
String sourceDOI = "10.1061/(asce)0733-9399(2002)128:7(759)";
|
||||
|
||||
final String sourceOpenaireId = ID_PREFIX
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI));
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":["https:\/\/ror.org\/01teme464"],"Confidence":0.73},{"RORid":["https:\/\/ror.org\/03yxnpp24"],"Confidence":0.7071067812}]}
|
||||
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":["https:\/\/ror.org\/02k40bc56"],"Confidence":0.7071067812}]}
|
||||
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":["https:\/\/ror.org\/00qjgza05"],"Confidence":1}]}
|
||||
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":["https:\/\/ror.org\/05apxxy63"],"Confidence":1},{"RORid":["https:\/\/ror.org\/035xkbk20"],"Confidence":1}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":["https:\/\/ror.org\/04j198w64"],"Confidence":0.58}]}
|
||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":["https:\/\/ror.org\/057xtrt18"],"Confidence":0.7071067812}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
|
||||
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
|
||||
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
|
||||
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
Loading…
Reference in New Issue