forked from D-Net/dnet-hadoop
[WebCrawl] addressing comments from PR
This commit is contained in:
parent
eb4692e4ee
commit
7de114bda0
|
@ -77,64 +77,11 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
|
||||||
createActionSet(spark, inputPath, outputPath + "actionSet");
|
createActionSet(spark, inputPath, outputPath );
|
||||||
createPlainRelations(spark, inputPath, outputPath + "relations");
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void createPlainRelations(SparkSession spark, String inputPath, String outputPath) {
|
|
||||||
final Dataset<Row> dataset = readWebCrawl(spark, inputPath);
|
|
||||||
|
|
||||||
dataset.flatMap((FlatMapFunction<Row, Tuple2<String, Relation>>) row -> {
|
|
||||||
List<Tuple2<String, Relation>> ret = new ArrayList<>();
|
|
||||||
|
|
||||||
final String ror = row.getAs("ror");
|
|
||||||
ret.addAll(createAffiliationRelationPairDOI(row.getAs("publication_year"), row.getAs("doi"), ror));
|
|
||||||
ret.addAll(createAffiliationRelationPairPMID(row.getAs("publication_year"), row.getAs("pmid"), ror));
|
|
||||||
ret.addAll(createAffiliationRelationPairPMCID(row.getAs("publication_year"), row.getAs("pmcid"), ror));
|
|
||||||
|
|
||||||
return ret
|
|
||||||
.iterator();
|
|
||||||
}, Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class)))
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.option("compression", "gzip")
|
|
||||||
.json(outputPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Collection<? extends Tuple2<String, Relation>> createAffiliationRelationPairPMCID(
|
|
||||||
String publication_year, String pmcid, String ror) {
|
|
||||||
if (pmcid == null)
|
|
||||||
return new ArrayList<>();
|
|
||||||
|
|
||||||
return createAffiliatioRelationPair("PMC" + pmcid, ror)
|
|
||||||
.stream()
|
|
||||||
.map(r -> new Tuple2<String, Relation>(publication_year, r))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Collection<? extends Tuple2<String, Relation>> createAffiliationRelationPairPMID(
|
|
||||||
String publication_year, String pmid, String ror) {
|
|
||||||
if (pmid == null)
|
|
||||||
return new ArrayList<>();
|
|
||||||
|
|
||||||
return createAffiliatioRelationPair(pmid, ror)
|
|
||||||
.stream()
|
|
||||||
.map(r -> new Tuple2<String, Relation>(publication_year, r))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Collection<? extends Tuple2<String, Relation>> createAffiliationRelationPairDOI(
|
|
||||||
String publication_year, String doi, String ror) {
|
|
||||||
if (doi == null)
|
|
||||||
return new ArrayList<>();
|
|
||||||
|
|
||||||
return createAffiliatioRelationPair(doi, ror)
|
|
||||||
.stream()
|
|
||||||
.map(r -> new Tuple2<String, Relation>(publication_year, r))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void createActionSet(SparkSession spark, String inputPath,
|
public static void createActionSet(SparkSession spark, String inputPath,
|
||||||
String outputPath) {
|
String outputPath) {
|
||||||
|
|
||||||
|
@ -185,7 +132,6 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
.selectExpr(
|
.selectExpr(
|
||||||
"id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
|
"id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
|
||||||
"institution.country_code as country_code", "publication_year")
|
"institution.country_code as country_code", "publication_year")
|
||||||
// .where("country_code == 'IE'")
|
|
||||||
.distinct();
|
.distinct();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -197,7 +143,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
return createAffiliatioRelationPair(
|
return createAffiliatioRelationPair(
|
||||||
PMCID_PREFIX
|
PMCID_PREFIX
|
||||||
+ IdentifierFactory
|
+ IdentifierFactory
|
||||||
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), "PMC" + pmcid.substring(43))),
|
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), removeResolver("PMC" , pmcid))),
|
||||||
ror);
|
ror);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -208,10 +154,24 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
return createAffiliatioRelationPair(
|
return createAffiliatioRelationPair(
|
||||||
PMID_PREFIX
|
PMID_PREFIX
|
||||||
+ IdentifierFactory
|
+ IdentifierFactory
|
||||||
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), pmid.substring(33))),
|
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), removeResolver("PMID", pmid))),
|
||||||
ror);
|
ror);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static String removeResolver(String pidType, String pid) {
|
||||||
|
switch (pidType){
|
||||||
|
case "PMID":
|
||||||
|
return pid.substring(33);
|
||||||
|
case "PMC":
|
||||||
|
return "PMC" + pid.substring(43);
|
||||||
|
case "DOI":
|
||||||
|
return pid.substring(16);
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new RuntimeException();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) {
|
private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) {
|
||||||
if (doi == null)
|
if (doi == null)
|
||||||
return new ArrayList<>();
|
return new ArrayList<>();
|
||||||
|
@ -219,7 +179,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
return createAffiliatioRelationPair(
|
return createAffiliatioRelationPair(
|
||||||
DOI_PREFIX
|
DOI_PREFIX
|
||||||
+ IdentifierFactory
|
+ IdentifierFactory
|
||||||
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), doi.substring(16))),
|
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), removeResolver("DOI" ,doi))),
|
||||||
ror);
|
ror);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,12 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -25,6 +24,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author miriam.baglioni
|
* @author miriam.baglioni
|
||||||
|
@ -67,6 +69,7 @@ public class CreateASTest {
|
||||||
FileUtils.deleteDirectory(workingDir.toFile());
|
FileUtils.deleteDirectory(workingDir.toFile());
|
||||||
spark.stop();
|
spark.stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testNumberofRelations() throws Exception {
|
void testNumberofRelations() throws Exception {
|
||||||
|
|
||||||
|
@ -96,6 +99,7 @@ public class CreateASTest {
|
||||||
Assertions.assertEquals(64, tmp.count());
|
Assertions.assertEquals(64, tmp.count());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testRelations() throws Exception {
|
void testRelations() throws Exception {
|
||||||
|
|
||||||
|
@ -280,6 +284,4 @@ public class CreateASTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue