1
0
Fork 0

Merge pull request '[prod] Openaire Affiliation Inference' (#453) from affRoFromRawStringmain into main

Reviewed-on: D-Net/dnet-hadoop#453
This commit is contained in:
Claudio Atzori 2024-07-03 12:32:26 +02:00
commit ed97ba4565
8 changed files with 42 additions and 14 deletions

View File

@ -42,6 +42,9 @@ public class Constants {
public static final String NULL = "NULL";
public static final String NA = "N/A";
public static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
public static final String WEB_CRAWL_NAME = "Web Crawl";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private Constants() {

View File

@ -9,6 +9,7 @@ import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
@ -40,9 +41,9 @@ public class PrepareAffiliationRelations implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String ID_PREFIX = "50|doi_________::";
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference";
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!";
public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref";
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
public static <I extends Result> void main(String[] args) throws Exception {
@ -70,6 +71,9 @@ public class PrepareAffiliationRelations implements Serializable {
final String dataciteInputPath = parser.get("dataciteInputPath");
log.info("dataciteInputPath: {}", dataciteInputPath);
final String webcrawlInputPath = parser.get("webCrawlInputPath");
log.info("webcrawlInputPath: {}", webcrawlInputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
@ -101,12 +105,18 @@ public class PrepareAffiliationRelations implements Serializable {
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
spark, dataciteInputPath, collectedFromDatacite);
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
spark, webcrawlInputPath, collectedFromWebCrawl);
crossrefRelations
.union(pubmedRelations)
.union(openAPCRelations)
.union(dataciteRelations)
.union(webCrawlRelations)
.saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
});
}

View File

@ -21,6 +21,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.Constants;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -29,6 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import io.netty.util.Constant;
import scala.Tuple2;
/**
@ -44,8 +46,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
private static final String PMID_PREFIX = "50|pmid________::";
private static final String PMCID_PREFIX = "50|pmc_________::";
private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
private static final String WEB_CRAWL_NAME = "Web Crawl";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
@ -220,7 +221,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
Arrays
.asList(
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
OafMapperUtils
.dataInfo(
false, null, false, false,
@ -239,7 +240,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
ModelConstants.HAS_AUTHOR_INSTITUTION,
Arrays
.asList(
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
OafMapperUtils
.dataInfo(
false, null, false, false,

View File

@ -28,7 +28,13 @@
"paramLongName": "dataciteInputPath",
"paramDescription": "the path to get the input data from Datacite",
"paramRequired": true
},
},{
"paramName": "wip",
"paramLongName": "webCrawlInputPath",
"paramDescription": "the path to get the input data from Web Crawl",
"paramRequired": true
}
,
{
"paramName": "o",
"paramLongName": "outputPath",

View File

@ -35,5 +35,6 @@ crossrefInputPath=/data/bip-affiliations/crossref-data.json
pubmedInputPath=/data/bip-affiliations/pubmed-data.json
openapcInputPath=/data/bip-affiliations/openapc-data.json
dataciteInputPath=/data/bip-affiliations/datacite-data.json
webCrawlInputPath=/data/bip-affiliations/webCrawl/
outputPath=/tmp/crossref-affiliations-output-v5

View File

@ -17,6 +17,10 @@
<name>dataciteInputPath</name>
<description>the path where to find the inferred affiliation relations from Datacite</description>
</property>
<property>
<name>webCrawlInputPath</name>
<description>the path where to find the inferred affiliation relations from webCrawl</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
@ -112,7 +116,7 @@
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
<arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>

View File

@ -88,6 +88,7 @@ public class PrepareAffiliationRelationsTest {
"-pubmedInputPath", crossrefAffiliationRelationPath,
"-openapcInputPath", crossrefAffiliationRelationPath,
"-dataciteInputPath", crossrefAffiliationRelationPath,
"-webCrawlInputPath", crossrefAffiliationRelationPath,
"-outputPath", outputPath
});
@ -104,7 +105,7 @@ public class PrepareAffiliationRelationsTest {
// );
// }
// count the number of relations
assertEquals(80, tmp.count());
assertEquals(120, tmp.count());
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result");
@ -115,7 +116,7 @@ public class PrepareAffiliationRelationsTest {
// verify that we have equal number of bi-directional relations
Assertions
.assertEquals(
40, execVerification
60, execVerification
.filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList()
@ -123,7 +124,7 @@ public class PrepareAffiliationRelationsTest {
Assertions
.assertEquals(
40, execVerification
60, execVerification
.filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList()

View File

@ -4,4 +4,6 @@
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}