forked from D-Net/dnet-hadoop
[AffiliationIngestion]Extended the ingestion of affiliation from open aire to include also links derived from Web Crawl. Extended the test. Inserted in Constatns the id and name of the webcrawl datasource to be used here and also in the ingestion of links from web crawl
This commit is contained in:
parent
14539f9c8b
commit
236b64d830
|
@ -42,6 +42,9 @@ public class Constants {
|
||||||
public static final String NULL = "NULL";
|
public static final String NULL = "NULL";
|
||||||
public static final String NA = "N/A";
|
public static final String NA = "N/A";
|
||||||
|
|
||||||
|
public static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
|
||||||
|
public static final String WEB_CRAWL_NAME = "Web Crawl";
|
||||||
|
|
||||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
private Constants() {
|
private Constants() {
|
||||||
|
|
|
@ -41,9 +41,9 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class);
|
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class);
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
private static final String ID_PREFIX = "50|doi_________::";
|
private static final String ID_PREFIX = "50|doi_________::";
|
||||||
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference";
|
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
|
||||||
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!";
|
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
|
||||||
public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref";
|
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
|
||||||
|
|
||||||
public static <I extends Result> void main(String[] args) throws Exception {
|
public static <I extends Result> void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
@ -71,6 +71,9 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
final String dataciteInputPath = parser.get("dataciteInputPath");
|
final String dataciteInputPath = parser.get("dataciteInputPath");
|
||||||
log.info("dataciteInputPath: {}", dataciteInputPath);
|
log.info("dataciteInputPath: {}", dataciteInputPath);
|
||||||
|
|
||||||
|
final String webcrawlInputPath = parser.get("webCrawlInputPath");
|
||||||
|
log.info("webcrawlInputPath: {}", webcrawlInputPath);
|
||||||
|
|
||||||
final String outputPath = parser.get("outputPath");
|
final String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath: {}", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
|
@ -102,10 +105,16 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||||
spark, dataciteInputPath, collectedFromDatacite);
|
spark, dataciteInputPath, collectedFromDatacite);
|
||||||
|
|
||||||
|
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
||||||
|
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
|
||||||
|
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||||
|
spark, webcrawlInputPath, collectedFromWebCrawl);
|
||||||
|
|
||||||
crossrefRelations
|
crossrefRelations
|
||||||
.union(pubmedRelations)
|
.union(pubmedRelations)
|
||||||
.union(openAPCRelations)
|
.union(openAPCRelations)
|
||||||
.union(dataciteRelations)
|
.union(dataciteRelations)
|
||||||
|
.union(webCrawlRelations)
|
||||||
.saveAsHadoopFile(
|
.saveAsHadoopFile(
|
||||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,15 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
||||||
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.actionmanager.Constants;
|
||||||
|
import io.netty.util.Constant;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
|
@ -44,8 +47,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
private static final String PMID_PREFIX = "50|pmid________::";
|
private static final String PMID_PREFIX = "50|pmid________::";
|
||||||
|
|
||||||
private static final String PMCID_PREFIX = "50|pmc_________::";
|
private static final String PMCID_PREFIX = "50|pmc_________::";
|
||||||
private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
|
|
||||||
private static final String WEB_CRAWL_NAME = "Web Crawl";
|
|
||||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
|
@ -214,7 +216,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
|
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
|
||||||
Arrays
|
Arrays
|
||||||
.asList(
|
.asList(
|
||||||
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
|
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
|
||||||
OafMapperUtils
|
OafMapperUtils
|
||||||
.dataInfo(
|
.dataInfo(
|
||||||
false, null, false, false,
|
false, null, false, false,
|
||||||
|
@ -233,7 +235,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
ModelConstants.HAS_AUTHOR_INSTITUTION,
|
ModelConstants.HAS_AUTHOR_INSTITUTION,
|
||||||
Arrays
|
Arrays
|
||||||
.asList(
|
.asList(
|
||||||
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
|
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
|
||||||
OafMapperUtils
|
OafMapperUtils
|
||||||
.dataInfo(
|
.dataInfo(
|
||||||
false, null, false, false,
|
false, null, false, false,
|
||||||
|
|
|
@ -28,7 +28,13 @@
|
||||||
"paramLongName": "dataciteInputPath",
|
"paramLongName": "dataciteInputPath",
|
||||||
"paramDescription": "the path to get the input data from Datacite",
|
"paramDescription": "the path to get the input data from Datacite",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},{
|
||||||
|
"paramName": "wip",
|
||||||
|
"paramLongName": "webCrawlInputPath",
|
||||||
|
"paramDescription": "the path to get the input data from Web Crawl",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
,
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "outputPath",
|
"paramLongName": "outputPath",
|
||||||
|
|
|
@ -17,6 +17,10 @@
|
||||||
<name>dataciteInputPath</name>
|
<name>dataciteInputPath</name>
|
||||||
<description>the path where to find the inferred affiliation relations from Datacite</description>
|
<description>the path where to find the inferred affiliation relations from Datacite</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>webCrawlInputPath</name>
|
||||||
|
<description>the path where to find the inferred affiliation relations from webCrawl</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>outputPath</name>
|
<name>outputPath</name>
|
||||||
<description>the path where to store the actionset</description>
|
<description>the path where to store the actionset</description>
|
||||||
|
@ -112,7 +116,7 @@
|
||||||
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
|
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
|
||||||
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
|
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
|
||||||
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
|
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
|
||||||
|
<arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
|
|
|
@ -88,6 +88,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
"-pubmedInputPath", crossrefAffiliationRelationPath,
|
"-pubmedInputPath", crossrefAffiliationRelationPath,
|
||||||
"-openapcInputPath", crossrefAffiliationRelationPath,
|
"-openapcInputPath", crossrefAffiliationRelationPath,
|
||||||
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
||||||
|
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
||||||
"-outputPath", outputPath
|
"-outputPath", outputPath
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -104,7 +105,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
// );
|
// );
|
||||||
// }
|
// }
|
||||||
// count the number of relations
|
// count the number of relations
|
||||||
assertEquals(80, tmp.count());
|
assertEquals(120, tmp.count());
|
||||||
|
|
||||||
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||||
dataset.createOrReplaceTempView("result");
|
dataset.createOrReplaceTempView("result");
|
||||||
|
@ -115,7 +116,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
// verify that we have equal number of bi-directional relations
|
// verify that we have equal number of bi-directional relations
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
40, execVerification
|
60, execVerification
|
||||||
.filter(
|
.filter(
|
||||||
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
|
@ -123,7 +124,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
40, execVerification
|
60, execVerification
|
||||||
.filter(
|
.filter(
|
||||||
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
|
|
|
@ -4,4 +4,6 @@
|
||||||
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
|
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
|
||||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
||||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
||||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
||||||
|
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
|
||||||
|
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
|
Loading…
Reference in New Issue