[WebCrawl] addressing comments from PR

This commit is contained in:
Miriam Baglioni 2024-04-22 13:52:50 +02:00
parent eb4692e4ee
commit 7de114bda0
2 changed files with 376 additions and 414 deletions

View File

@ -77,64 +77,11 @@ public class CreateActionSetFromWebEntries implements Serializable {
isSparkSessionManaged,
spark -> {
createActionSet(spark, inputPath, outputPath + "actionSet");
createPlainRelations(spark, inputPath, outputPath + "relations");
createActionSet(spark, inputPath, outputPath );
});
}
private static void createPlainRelations(SparkSession spark, String inputPath, String outputPath) {
final Dataset<Row> dataset = readWebCrawl(spark, inputPath);
dataset.flatMap((FlatMapFunction<Row, Tuple2<String, Relation>>) row -> {
List<Tuple2<String, Relation>> ret = new ArrayList<>();
final String ror = row.getAs("ror");
ret.addAll(createAffiliationRelationPairDOI(row.getAs("publication_year"), row.getAs("doi"), ror));
ret.addAll(createAffiliationRelationPairPMID(row.getAs("publication_year"), row.getAs("pmid"), ror));
ret.addAll(createAffiliationRelationPairPMCID(row.getAs("publication_year"), row.getAs("pmcid"), ror));
return ret
.iterator();
}, Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static Collection<? extends Tuple2<String, Relation>> createAffiliationRelationPairPMCID(
String publication_year, String pmcid, String ror) {
if (pmcid == null)
return new ArrayList<>();
return createAffiliatioRelationPair("PMC" + pmcid, ror)
.stream()
.map(r -> new Tuple2<String, Relation>(publication_year, r))
.collect(Collectors.toList());
}
private static Collection<? extends Tuple2<String, Relation>> createAffiliationRelationPairPMID(
String publication_year, String pmid, String ror) {
if (pmid == null)
return new ArrayList<>();
return createAffiliatioRelationPair(pmid, ror)
.stream()
.map(r -> new Tuple2<String, Relation>(publication_year, r))
.collect(Collectors.toList());
}
private static Collection<? extends Tuple2<String, Relation>> createAffiliationRelationPairDOI(
String publication_year, String doi, String ror) {
if (doi == null)
return new ArrayList<>();
return createAffiliatioRelationPair(doi, ror)
.stream()
.map(r -> new Tuple2<String, Relation>(publication_year, r))
.collect(Collectors.toList());
}
public static void createActionSet(SparkSession spark, String inputPath,
String outputPath) {
@ -185,7 +132,6 @@ public class CreateActionSetFromWebEntries implements Serializable {
.selectExpr(
"id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
"institution.country_code as country_code", "publication_year")
// .where("country_code == 'IE'")
.distinct();
}
@ -197,7 +143,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
return createAffiliatioRelationPair(
PMCID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), "PMC" + pmcid.substring(43))),
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), removeResolver("PMC" , pmcid))),
ror);
}
@ -208,10 +154,24 @@ public class CreateActionSetFromWebEntries implements Serializable {
return createAffiliatioRelationPair(
PMID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), pmid.substring(33))),
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), removeResolver("PMID", pmid))),
ror);
}
private static String removeResolver(String pidType, String pid) {
switch (pidType){
case "PMID":
return pid.substring(33);
case "PMC":
return "PMC" + pid.substring(43);
case "DOI":
return pid.substring(16);
}
throw new RuntimeException();
}
private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) {
if (doi == null)
return new ArrayList<>();
@ -219,7 +179,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
return createAffiliatioRelationPair(
DOI_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), doi.substring(16))),
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), removeResolver("DOI" ,doi))),
ror);
}

View File

@ -1,13 +1,12 @@
package eu.dnetlib.dhp.actionmanager.webcrawl;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
@ -25,6 +24,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
/**
* @author miriam.baglioni
@ -67,6 +69,7 @@ public class CreateASTest {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void testNumberofRelations() throws Exception {
@ -96,6 +99,7 @@ public class CreateASTest {
Assertions.assertEquals(64, tmp.count());
}
@Test
void testRelations() throws Exception {
@ -280,6 +284,4 @@ public class CreateASTest {
}
}