[WebCrawl] addressing comments from PR
This commit is contained in:
parent
eb4692e4ee
commit
7de114bda0
|
@ -77,64 +77,11 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
|
||||
createActionSet(spark, inputPath, outputPath + "actionSet");
|
||||
createPlainRelations(spark, inputPath, outputPath + "relations");
|
||||
createActionSet(spark, inputPath, outputPath );
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
private static void createPlainRelations(SparkSession spark, String inputPath, String outputPath) {
|
||||
final Dataset<Row> dataset = readWebCrawl(spark, inputPath);
|
||||
|
||||
dataset.flatMap((FlatMapFunction<Row, Tuple2<String, Relation>>) row -> {
|
||||
List<Tuple2<String, Relation>> ret = new ArrayList<>();
|
||||
|
||||
final String ror = row.getAs("ror");
|
||||
ret.addAll(createAffiliationRelationPairDOI(row.getAs("publication_year"), row.getAs("doi"), ror));
|
||||
ret.addAll(createAffiliationRelationPairPMID(row.getAs("publication_year"), row.getAs("pmid"), ror));
|
||||
ret.addAll(createAffiliationRelationPairPMCID(row.getAs("publication_year"), row.getAs("pmcid"), ror));
|
||||
|
||||
return ret
|
||||
.iterator();
|
||||
}, Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static Collection<? extends Tuple2<String, Relation>> createAffiliationRelationPairPMCID(
|
||||
String publication_year, String pmcid, String ror) {
|
||||
if (pmcid == null)
|
||||
return new ArrayList<>();
|
||||
|
||||
return createAffiliatioRelationPair("PMC" + pmcid, ror)
|
||||
.stream()
|
||||
.map(r -> new Tuple2<String, Relation>(publication_year, r))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static Collection<? extends Tuple2<String, Relation>> createAffiliationRelationPairPMID(
|
||||
String publication_year, String pmid, String ror) {
|
||||
if (pmid == null)
|
||||
return new ArrayList<>();
|
||||
|
||||
return createAffiliatioRelationPair(pmid, ror)
|
||||
.stream()
|
||||
.map(r -> new Tuple2<String, Relation>(publication_year, r))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static Collection<? extends Tuple2<String, Relation>> createAffiliationRelationPairDOI(
|
||||
String publication_year, String doi, String ror) {
|
||||
if (doi == null)
|
||||
return new ArrayList<>();
|
||||
|
||||
return createAffiliatioRelationPair(doi, ror)
|
||||
.stream()
|
||||
.map(r -> new Tuple2<String, Relation>(publication_year, r))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static void createActionSet(SparkSession spark, String inputPath,
|
||||
String outputPath) {
|
||||
|
||||
|
@ -185,7 +132,6 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
|||
.selectExpr(
|
||||
"id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
|
||||
"institution.country_code as country_code", "publication_year")
|
||||
// .where("country_code == 'IE'")
|
||||
.distinct();
|
||||
|
||||
}
|
||||
|
@ -197,7 +143,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
|||
return createAffiliatioRelationPair(
|
||||
PMCID_PREFIX
|
||||
+ IdentifierFactory
|
||||
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), "PMC" + pmcid.substring(43))),
|
||||
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), removeResolver("PMC" , pmcid))),
|
||||
ror);
|
||||
}
|
||||
|
||||
|
@ -208,10 +154,24 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
|||
return createAffiliatioRelationPair(
|
||||
PMID_PREFIX
|
||||
+ IdentifierFactory
|
||||
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), pmid.substring(33))),
|
||||
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), removeResolver("PMID", pmid))),
|
||||
ror);
|
||||
}
|
||||
|
||||
private static String removeResolver(String pidType, String pid) {
|
||||
switch (pidType){
|
||||
case "PMID":
|
||||
return pid.substring(33);
|
||||
case "PMC":
|
||||
return "PMC" + pid.substring(43);
|
||||
case "DOI":
|
||||
return pid.substring(16);
|
||||
}
|
||||
|
||||
throw new RuntimeException();
|
||||
|
||||
}
|
||||
|
||||
private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) {
|
||||
if (doi == null)
|
||||
return new ArrayList<>();
|
||||
|
@ -219,7 +179,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
|||
return createAffiliatioRelationPair(
|
||||
DOI_PREFIX
|
||||
+ IdentifierFactory
|
||||
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), doi.substring(16))),
|
||||
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), removeResolver("DOI" ,doi))),
|
||||
ror);
|
||||
|
||||
}
|
||||
|
|
|
@ -1,13 +1,12 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -25,6 +24,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
|
@ -67,6 +69,7 @@ public class CreateASTest {
|
|||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
void testNumberofRelations() throws Exception {
|
||||
|
||||
|
@ -96,6 +99,7 @@ public class CreateASTest {
|
|||
Assertions.assertEquals(64, tmp.count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testRelations() throws Exception {
|
||||
|
||||
|
@ -280,6 +284,4 @@ public class CreateASTest {
|
|||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue