[beta] OpenAIRE Affiliation Inference #452
|
@ -42,6 +42,9 @@ public class Constants {
|
||||||
public static final String NULL = "NULL";
|
public static final String NULL = "NULL";
|
||||||
public static final String NA = "N/A";
|
public static final String NA = "N/A";
|
||||||
|
|
||||||
|
public static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
|
||||||
|
public static final String WEB_CRAWL_NAME = "Web Crawl";
|
||||||
|
|
||||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
private Constants() {
|
private Constants() {
|
||||||
|
|
|
@ -41,9 +41,9 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class);
|
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class);
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
private static final String ID_PREFIX = "50|doi_________::";
|
private static final String ID_PREFIX = "50|doi_________::";
|
||||||
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference";
|
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
|
||||||
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!";
|
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
|
||||||
public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref";
|
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
|
||||||
|
|
||||||
public static <I extends Result> void main(String[] args) throws Exception {
|
public static <I extends Result> void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
@ -71,6 +71,9 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
final String dataciteInputPath = parser.get("dataciteInputPath");
|
final String dataciteInputPath = parser.get("dataciteInputPath");
|
||||||
log.info("dataciteInputPath: {}", dataciteInputPath);
|
log.info("dataciteInputPath: {}", dataciteInputPath);
|
||||||
|
|
||||||
|
final String webcrawlInputPath = parser.get("webCrawlInputPath");
|
||||||
|
log.info("webcrawlInputPath: {}", webcrawlInputPath);
|
||||||
|
|
||||||
final String outputPath = parser.get("outputPath");
|
final String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath: {}", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
|
@ -102,10 +105,16 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||||
spark, dataciteInputPath, collectedFromDatacite);
|
spark, dataciteInputPath, collectedFromDatacite);
|
||||||
|
|
||||||
|
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
||||||
|
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
|
||||||
|
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||||
|
spark, webcrawlInputPath, collectedFromWebCrawl);
|
||||||
|
|
||||||
crossrefRelations
|
crossrefRelations
|
||||||
.union(pubmedRelations)
|
.union(pubmedRelations)
|
||||||
.union(openAPCRelations)
|
.union(openAPCRelations)
|
||||||
.union(dataciteRelations)
|
.union(dataciteRelations)
|
||||||
|
.union(webCrawlRelations)
|
||||||
.saveAsHadoopFile(
|
.saveAsHadoopFile(
|
||||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,15 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
||||||
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.actionmanager.Constants;
|
||||||
|
import io.netty.util.Constant;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
|
@ -44,8 +47,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
private static final String PMID_PREFIX = "50|pmid________::";
|
private static final String PMID_PREFIX = "50|pmid________::";
|
||||||
|
|
||||||
private static final String PMCID_PREFIX = "50|pmc_________::";
|
private static final String PMCID_PREFIX = "50|pmc_________::";
|
||||||
private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
|
|
||||||
private static final String WEB_CRAWL_NAME = "Web Crawl";
|
|
||||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
|
@ -214,7 +216,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
|
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
|
||||||
Arrays
|
Arrays
|
||||||
.asList(
|
.asList(
|
||||||
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
|
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
|
||||||
OafMapperUtils
|
OafMapperUtils
|
||||||
.dataInfo(
|
.dataInfo(
|
||||||
false, null, false, false,
|
false, null, false, false,
|
||||||
|
@ -233,7 +235,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
ModelConstants.HAS_AUTHOR_INSTITUTION,
|
ModelConstants.HAS_AUTHOR_INSTITUTION,
|
||||||
Arrays
|
Arrays
|
||||||
.asList(
|
.asList(
|
||||||
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
|
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
|
||||||
OafMapperUtils
|
OafMapperUtils
|
||||||
.dataInfo(
|
.dataInfo(
|
||||||
false, null, false, false,
|
false, null, false, false,
|
||||||
|
|
|
@ -28,7 +28,13 @@
|
||||||
"paramLongName": "dataciteInputPath",
|
"paramLongName": "dataciteInputPath",
|
||||||
"paramDescription": "the path to get the input data from Datacite",
|
"paramDescription": "the path to get the input data from Datacite",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},{
|
||||||
|
"paramName": "wip",
|
||||||
|
"paramLongName": "webCrawlInputPath",
|
||||||
|
"paramDescription": "the path to get the input data from Web Crawl",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
,
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "outputPath",
|
"paramLongName": "outputPath",
|
||||||
|
|
|
@ -17,6 +17,10 @@
|
||||||
<name>dataciteInputPath</name>
|
<name>dataciteInputPath</name>
|
||||||
<description>the path where to find the inferred affiliation relations from Datacite</description>
|
<description>the path where to find the inferred affiliation relations from Datacite</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>webCrawlInputPath</name>
|
||||||
|
<description>the path where to find the inferred affiliation relations from webCrawl</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>outputPath</name>
|
<name>outputPath</name>
|
||||||
<description>the path where to store the actionset</description>
|
<description>the path where to store the actionset</description>
|
||||||
|
@ -112,7 +116,7 @@
|
||||||
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
|
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
|
||||||
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
|
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
|
||||||
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
|
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
|
||||||
|
<arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
|
|
|
@ -88,6 +88,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
"-pubmedInputPath", crossrefAffiliationRelationPath,
|
"-pubmedInputPath", crossrefAffiliationRelationPath,
|
||||||
"-openapcInputPath", crossrefAffiliationRelationPath,
|
"-openapcInputPath", crossrefAffiliationRelationPath,
|
||||||
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
||||||
|
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
||||||
"-outputPath", outputPath
|
"-outputPath", outputPath
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -104,7 +105,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
// );
|
// );
|
||||||
// }
|
// }
|
||||||
// count the number of relations
|
// count the number of relations
|
||||||
assertEquals(80, tmp.count());
|
assertEquals(120, tmp.count());
|
||||||
|
|
||||||
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||||
dataset.createOrReplaceTempView("result");
|
dataset.createOrReplaceTempView("result");
|
||||||
|
@ -115,7 +116,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
// verify that we have equal number of bi-directional relations
|
// verify that we have equal number of bi-directional relations
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
40, execVerification
|
60, execVerification
|
||||||
.filter(
|
.filter(
|
||||||
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
|
@ -123,7 +124,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
40, execVerification
|
60, execVerification
|
||||||
.filter(
|
.filter(
|
||||||
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
|
|
|
@ -5,3 +5,5 @@
|
||||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
||||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
||||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
||||||
|
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
|
||||||
|
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
|
Loading…
Reference in New Issue