[AffRo] refactoring

This commit is contained in:
Miriam Baglioni 2024-09-25 17:12:51 +02:00
parent a54d021c37
commit 7cd8171268
3 changed files with 25 additions and 18 deletions

View File

@ -143,11 +143,11 @@ public class PrepareAffiliationRelations implements Serializable {
Dataset<Row> df = spark
.read()
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
.schema(
"`DOI` STRING, `Organizations` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
.json(inputPath)
.where("DOI is not null");
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
}
@ -158,11 +158,11 @@ public class PrepareAffiliationRelations implements Serializable {
// load and parse affiliation relations from HDFS
Dataset<Row> df = spark
.read()
.schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
.schema(
"`DOI` STRING, `Matchings` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
.json(inputPath)
.where("DOI is not null");
return getTextTextJavaPairRDD(collectedfrom, df);
}
@ -175,9 +175,8 @@ public class PrepareAffiliationRelations implements Serializable {
new Column("matching.PID").as("pidtype"),
new Column("matching.Value").as("pidvalue"),
new Column("matching.Confidence").as("confidence"),
new Column("matching.Status").as("status"))
.where("status = 'active'");
new Column("matching.Status").as("status"))
.where("status = 'active'");
// prepare action sets for affiliation relations
return df
@ -188,14 +187,13 @@ public class PrepareAffiliationRelations implements Serializable {
final String paperId = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
// Organization to OpenAIRE identifier
String affId = null;
if(row.getAs("pidtype").equals("ROR"))
//ROR id to OpenAIRE id
affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("pidvalue"));
if (row.getAs("pidtype").equals("ROR"))
// ROR id to OpenAIRE id
affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("pidvalue"));
else
//getting the OpenOrgs identifier for the organization
// getting the OpenOrgs identifier for the organization
affId = row.getAs("pidvalue");
Qualifier qualifier = OafMapperUtils

View File

@ -517,8 +517,10 @@ case object Crossref2Oaf {
)
}
if(doi.startsWith("10.3410") || doi.startsWith("10.12703"))
instance.setHostedby(OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true),"H1Connect"))
if (doi.startsWith("10.3410") || doi.startsWith("10.12703"))
instance.setHostedby(
OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true), "H1Connect")
)
instance.setAccessright(
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)

View File

@ -110,12 +110,11 @@ public class PrepareAffiliationRelationsTest {
// );
// }
// count the number of relations
assertEquals(168, tmp.count());//150 +
assertEquals(168, tmp.count());// 150 +
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result");
Dataset<Row> execVerification = spark
.sql("select r.relType, r.relClass, r.source, r.target, r.dataInfo.trust from result r");
@ -159,7 +158,15 @@ public class PrepareAffiliationRelationsTest {
.assertEquals(
5, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
Assertions.assertEquals(1,execVerification.filter("source = '" + ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s00217-010-1268-9")) + "' and target = '" + "20|ror_________::" + IdentifierFactory.md5("https://ror.org/03265fv13") + "'").count());
Assertions
.assertEquals(
1, execVerification
.filter(
"source = '" + ID_PREFIX
+ IdentifierFactory
.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s00217-010-1268-9"))
+ "' and target = '" + "20|ror_________::"
+ IdentifierFactory.md5("https://ror.org/03265fv13") + "'")
.count());
}
}