[WebCrawl] Remove relations for PIDs other than DOI (PMID, PMCID)

This commit is contained in:
Miriam Baglioni 2024-06-11 17:03:59 +02:00
parent 3efd5b1308
commit 143c525343
1 changed file with 9 additions and 3 deletions

View File

@@ -104,8 +104,8 @@ public class CreateActionSetFromWebEntries implements Serializable {
final String ror = ROR_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
// ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
// ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
return ret
.iterator();
@@ -139,11 +139,17 @@ public class CreateActionSetFromWebEntries implements Serializable {
"institution", functions
.explode(
functions.col("institutions")))
.selectExpr(
"id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
"id", "doi", "institution.ror as ror",
"institution.country_code as country_code", "publication_year")
.distinct();
// .selectExpr(
// "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
// "institution.country_code as country_code", "publication_year")
// .distinct();
}
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {