[OpenCitation] added logic to avoid the generation of self-citation relations

This commit is contained in:
Miriam Baglioni 2022-02-08 16:15:34 +01:00
parent b071f8e415
commit 759ed519f2
6 changed files with 24 additions and 13 deletions

View File

@ -115,18 +115,20 @@ public class CreateActionSetSparkJob implements Serializable {
final String cited = ID_PREFIX final String cited = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited())); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));
relationList if(!citing.equals(cited)){
.addAll( relationList
getRelations( .addAll(
citing, getRelations(
cited)); citing,
cited));
if (duplicate && value.getCiting().endsWith(".refs")) { if (duplicate && value.getCiting().endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory citing = ID_PREFIX + IdentifierFactory
.md5( .md5(
CleaningFunctions CleaningFunctions
.normalizePidValue("doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs")))); .normalizePidValue("doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs"))));
relationList.addAll(getRelations(citing, cited)); relationList.addAll(getRelations(citing, cited));
}
} }
return relationList; return relationList;

View File

@ -103,6 +103,13 @@ public class ReadCOCITest {
.getPath()), .getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4")); new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4"));
fs
.copyFromLocalFile(
false, new org.apache.hadoop.fs.Path(getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input5")
.getPath()),
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input5"));
ReadCOCI ReadCOCI
.main( .main(
new String[] { new String[] {
@ -112,7 +119,7 @@ public class ReadCOCITest {
workingDir.toString() + "/COCI", workingDir.toString() + "/COCI",
"-outputPath", "-outputPath",
workingDir.toString() + "/COCI_json/", workingDir.toString() + "/COCI_json/",
"-inputFile", "input1;input2;input3;input4" "-inputFile", "input1;input2;input3;input4;input5"
}); });
@ -123,7 +130,7 @@ public class ReadCOCITest {
.textFile(workingDir.toString() + "/COCI_json/*/") .textFile(workingDir.toString() + "/COCI_json/*/")
.map(item -> OBJECT_MAPPER.readValue(item, COCI.class)); .map(item -> OBJECT_MAPPER.readValue(item, COCI.class));
Assertions.assertEquals(23, tmp.count()); Assertions.assertEquals(24, tmp.count());
Assertions.assertEquals(1, tmp.filter(c -> c.getCiting().equals("10.1207/s15327647jcd3,4-01")).count()); Assertions.assertEquals(1, tmp.filter(c -> c.getCiting().equals("10.1207/s15327647jcd3,4-01")).count());

View File

@ -0,0 +1,2 @@
oci,citing,cited,creation,timespan,journal_sc,author_sc
02001000007362801000805046300010563030608046333-02001000007362801000805046300010563030608046333,10.1007/s10854-015-3684-x,10.1007/s10854-015-3684-x,2015-09-01,P7Y2M,no,no