[WebCrawlAffiliation] Remove the PMC and PMID relations from the creation of the action set. Only DOIs are allowed.

This commit is contained in:
Miriam Baglioni 2024-07-16 14:07:37 +02:00 committed by Claudio Atzori
parent 5fc413a5df
commit cbe877b73c
1 changed files with 7 additions and 44 deletions

View File

@ -30,7 +30,6 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner; import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import eu.dnetlib.dhp.schema.oaf.utils.PidType; import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import io.netty.util.Constant;
import scala.Tuple2; import scala.Tuple2;
/** /**
@ -105,8 +104,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
final String ror = ROR_PREFIX final String ror = ROR_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror"))); + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror)); ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
// ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
// ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
return ret return ret
.iterator(); .iterator();
@ -146,57 +144,22 @@ public class CreateActionSetFromWebEntries implements Serializable {
"institution.country_code as country_code", "publication_year") "institution.country_code as country_code", "publication_year")
.distinct(); .distinct();
// .selectExpr(
// "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
// "institution.country_code as country_code", "publication_year")
// .distinct();
} }
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) { private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
return spark return spark
.read() .read()
.option("header", true) .json(inputPath)
.csv(inputPath)
.select("OpenAlexId"); .select("OpenAlexId");
} }
private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {
if (pmcid == null)
return new ArrayList<>();
return createAffiliatioRelationPair(
PMCID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), removeResolver("PMC", pmcid))),
ror);
}
private static List<Relation> createAffiliationRelationPairPMID(String pmid, String ror) {
if (pmid == null)
return new ArrayList<>();
return createAffiliatioRelationPair(
PMID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), removeResolver("PMID", pmid))),
ror);
}
private static String removeResolver(String pidType, String pid) { private static String removeResolver(String pidType, String pid) {
switch (pidType) { if (pidType.equals("DOI")) {
case "PMID": return pid.substring(16);
return pid.substring(33); }
case "PMC": throw new IllegalArgumentException("DOI is the only supported PID type");
return "PMC" + pid.substring(43); }
case "DOI":
return pid.substring(16);
}
throw new RuntimeException();
}
private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) { private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) {
if (doi == null) if (doi == null)