[AffiliationFromPublisher]refactoring after compilation
This commit is contained in:
parent
907eeadce8
commit
42531afc3e
|
@ -89,64 +89,66 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
Constants.removeOutputDir(spark, outputPath);
|
Constants.removeOutputDir(spark, outputPath);
|
||||||
createActionSet(spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath, publisherInputPath, outputPath);
|
createActionSet(
|
||||||
|
spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath,
|
||||||
|
publisherInputPath, outputPath);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath, String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath, String outputPath) {
|
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath,
|
||||||
|
String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath,
|
||||||
|
String outputPath) {
|
||||||
List<KeyValue> collectedFromCrossref = OafMapperUtils
|
List<KeyValue> collectedFromCrossref = OafMapperUtils
|
||||||
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
|
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
|
||||||
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
|
||||||
spark, crossrefInputPath, collectedFromCrossref);
|
spark, crossrefInputPath, collectedFromCrossref);
|
||||||
|
|
||||||
List<KeyValue> collectedFromPubmed = OafMapperUtils
|
List<KeyValue> collectedFromPubmed = OafMapperUtils
|
||||||
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
|
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
|
||||||
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
||||||
spark, pubmedInputPath, collectedFromPubmed);
|
spark, pubmedInputPath, collectedFromPubmed);
|
||||||
|
|
||||||
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
|
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
|
||||||
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
|
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
|
||||||
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
|
||||||
spark, openapcInputPath, collectedFromOpenAPC);
|
spark, openapcInputPath, collectedFromOpenAPC);
|
||||||
|
|
||||||
List<KeyValue> collectedFromDatacite = OafMapperUtils
|
List<KeyValue> collectedFromDatacite = OafMapperUtils
|
||||||
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
|
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
|
||||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||||
spark, dataciteInputPath, collectedFromDatacite);
|
spark, dataciteInputPath, collectedFromDatacite);
|
||||||
|
|
||||||
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
||||||
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||||
spark, webcrawlInputPath, collectedFromWebCrawl);
|
spark, webcrawlInputPath, collectedFromWebCrawl);
|
||||||
|
|
||||||
List<KeyValue> collectedfromPublisher = OafMapperUtils
|
List<KeyValue> collectedfromPublisher = OafMapperUtils
|
||||||
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||||
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
|
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
|
||||||
spark, publisherlInputPath, collectedfromPublisher);
|
spark, publisherlInputPath, collectedfromPublisher);
|
||||||
|
|
||||||
crossrefRelations
|
crossrefRelations
|
||||||
.union(pubmedRelations)
|
.union(pubmedRelations)
|
||||||
.union(openAPCRelations)
|
.union(openAPCRelations)
|
||||||
.union(dataciteRelations)
|
.union(dataciteRelations)
|
||||||
.union(webCrawlRelations)
|
.union(webCrawlRelations)
|
||||||
.union(publisherRelations)
|
.union(publisherRelations)
|
||||||
.saveAsHadoopFile(
|
.saveAsHadoopFile(
|
||||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static JavaPairRDD<Text,Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
||||||
List<KeyValue> collectedfrom){
|
List<KeyValue> collectedfrom) {
|
||||||
|
|
||||||
|
|
||||||
Dataset<Row> df = spark
|
Dataset<Row> df = spark
|
||||||
.read()
|
.read()
|
||||||
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
||||||
.json(inputPath)
|
.json(inputPath)
|
||||||
.where("DOI is not null");
|
.where("DOI is not null");
|
||||||
|
|
||||||
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
|
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
|
||||||
|
@ -174,41 +176,41 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
|
|
||||||
// prepare action sets for affiliation relations
|
// prepare action sets for affiliation relations
|
||||||
return df
|
return df
|
||||||
.toJavaRDD()
|
.toJavaRDD()
|
||||||
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
||||||
|
|
||||||
// DOI to OpenAIRE id
|
// DOI to OpenAIRE id
|
||||||
final String paperId = ID_PREFIX
|
final String paperId = ID_PREFIX
|
||||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
|
||||||
|
|
||||||
// ROR id to OpenAIRE id
|
// ROR id to OpenAIRE id
|
||||||
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
|
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
|
||||||
|
|
||||||
Qualifier qualifier = OafMapperUtils
|
Qualifier qualifier = OafMapperUtils
|
||||||
.qualifier(
|
.qualifier(
|
||||||
BIP_AFFILIATIONS_CLASSID,
|
BIP_AFFILIATIONS_CLASSID,
|
||||||
BIP_AFFILIATIONS_CLASSNAME,
|
BIP_AFFILIATIONS_CLASSNAME,
|
||||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||||
ModelConstants.DNET_PROVENANCE_ACTIONS);
|
ModelConstants.DNET_PROVENANCE_ACTIONS);
|
||||||
|
|
||||||
// format data info; setting `confidence` into relation's `trust`
|
// format data info; setting `confidence` into relation's `trust`
|
||||||
DataInfo dataInfo = OafMapperUtils
|
DataInfo dataInfo = OafMapperUtils
|
||||||
.dataInfo(
|
.dataInfo(
|
||||||
false,
|
false,
|
||||||
BIP_INFERENCE_PROVENANCE,
|
BIP_INFERENCE_PROVENANCE,
|
||||||
true,
|
true,
|
||||||
false,
|
false,
|
||||||
qualifier,
|
qualifier,
|
||||||
Double.toString(row.getAs("confidence")));
|
Double.toString(row.getAs("confidence")));
|
||||||
|
|
||||||
// return bi-directional relations
|
// return bi-directional relations
|
||||||
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
|
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
|
||||||
|
|
||||||
})
|
})
|
||||||
.map(p -> new AtomicAction(Relation.class, p))
|
.map(p -> new AtomicAction(Relation.class, p))
|
||||||
.mapToPair(
|
.mapToPair(
|
||||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
|
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
|
||||||
|
|
|
@ -79,8 +79,8 @@ public class PrepareAffiliationRelationsTest {
|
||||||
.getPath();
|
.getPath();
|
||||||
|
|
||||||
String publisherAffiliationRelationPath = getClass()
|
String publisherAffiliationRelationPath = getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
|
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
|
||||||
.getPath();
|
.getPath();
|
||||||
|
|
||||||
String outputPath = workingDir.toString() + "/actionSet";
|
String outputPath = workingDir.toString() + "/actionSet";
|
||||||
|
|
||||||
|
@ -93,7 +93,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
"-openapcInputPath", crossrefAffiliationRelationPath,
|
"-openapcInputPath", crossrefAffiliationRelationPath,
|
||||||
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
||||||
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
||||||
"-publisherInputPath", publisherAffiliationRelationPath,
|
"-publisherInputPath", publisherAffiliationRelationPath,
|
||||||
"-outputPath", outputPath
|
"-outputPath", outputPath
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
@ -16,7 +17,8 @@ class EnrichMissingAuthorOrcidTest {
|
||||||
final EnrichMissingAuthorOrcid matcher = new EnrichMissingAuthorOrcid();
|
final EnrichMissingAuthorOrcid matcher = new EnrichMissingAuthorOrcid();
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
void setUp() throws Exception {}
|
void setUp() throws Exception {
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testFindDifferences_1() {
|
void testFindDifferences_1() {
|
||||||
|
|
|
@ -23,7 +23,8 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
public class ConversionUtilsTest {
|
public class ConversionUtilsTest {
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws Exception {}
|
public void setUp() throws Exception {
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testAllResultPids() {
|
public void testAllResultPids() {
|
||||||
|
|
|
@ -20,7 +20,6 @@ import javax.xml.transform.*;
|
||||||
import javax.xml.transform.dom.DOMSource;
|
import javax.xml.transform.dom.DOMSource;
|
||||||
import javax.xml.transform.stream.StreamResult;
|
import javax.xml.transform.stream.StreamResult;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.provision.model.*;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.lang3.tuple.ImmutablePair;
|
import org.apache.commons.lang3.tuple.ImmutablePair;
|
||||||
import org.apache.commons.lang3.tuple.Pair;
|
import org.apache.commons.lang3.tuple.Pair;
|
||||||
|
@ -42,6 +41,7 @@ import com.google.common.collect.Sets;
|
||||||
import com.mycila.xmltool.XMLDoc;
|
import com.mycila.xmltool.XMLDoc;
|
||||||
import com.mycila.xmltool.XMLTag;
|
import com.mycila.xmltool.XMLTag;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.oa.provision.model.*;
|
||||||
import eu.dnetlib.dhp.schema.common.*;
|
import eu.dnetlib.dhp.schema.common.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
Loading…
Reference in New Issue