Master branch updates from beta September 2023 #337
|
@ -112,18 +112,19 @@ public class CreateActionSetSparkJob implements Serializable {
|
||||||
final String cited = ID_PREFIX
|
final String cited = ID_PREFIX
|
||||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));
|
||||||
|
|
||||||
if(!citing.equals(cited)){
|
if (!citing.equals(cited)) {
|
||||||
relationList
|
relationList
|
||||||
.addAll(
|
.addAll(
|
||||||
getRelations(
|
getRelations(
|
||||||
citing,
|
citing,
|
||||||
cited));
|
cited));
|
||||||
|
|
||||||
if (duplicate && value.getCiting().endsWith(".refs")) {
|
if (duplicate && value.getCiting().endsWith(".refs")) {
|
||||||
citing = ID_PREFIX + IdentifierFactory
|
citing = ID_PREFIX + IdentifierFactory
|
||||||
.md5(
|
.md5(
|
||||||
CleaningFunctions
|
CleaningFunctions
|
||||||
.normalizePidValue("doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs"))));
|
.normalizePidValue(
|
||||||
|
"doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs"))));
|
||||||
relationList.addAll(getRelations(citing, cited));
|
relationList.addAll(getRelations(citing, cited));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -72,7 +72,7 @@ public class ReadCOCI implements Serializable {
|
||||||
String outputPath,
|
String outputPath,
|
||||||
String delimiter) throws IOException {
|
String delimiter) throws IOException {
|
||||||
|
|
||||||
for(String inputFile : inputFiles){
|
for (String inputFile : inputFiles) {
|
||||||
String p_string = workingPath + "/" + inputFile + ".gz";
|
String p_string = workingPath + "/" + inputFile + ".gz";
|
||||||
|
|
||||||
Dataset<Row> cociData = spark
|
Dataset<Row> cociData = spark
|
||||||
|
|
|
@ -12,7 +12,6 @@ public class COCI implements Serializable {
|
||||||
|
|
||||||
private String cited;
|
private String cited;
|
||||||
|
|
||||||
|
|
||||||
public String getOci() {
|
public String getOci() {
|
||||||
return oci;
|
return oci;
|
||||||
}
|
}
|
||||||
|
@ -37,5 +36,4 @@ public class COCI implements Serializable {
|
||||||
this.cited = cited;
|
this.cited = cited;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -76,53 +76,51 @@ public class ReadCOCITest {
|
||||||
|
|
||||||
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
|
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
|
||||||
fs
|
fs
|
||||||
.copyFromLocalFile(
|
.copyFromLocalFile(
|
||||||
false, new org.apache.hadoop.fs.Path(getClass()
|
false, new org.apache.hadoop.fs.Path(getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1.gz")
|
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1.gz")
|
||||||
.getPath()),
|
.getPath()),
|
||||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input1.gz"));
|
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input1.gz"));
|
||||||
|
|
||||||
fs
|
fs
|
||||||
.copyFromLocalFile(
|
.copyFromLocalFile(
|
||||||
false, new org.apache.hadoop.fs.Path(getClass()
|
false, new org.apache.hadoop.fs.Path(getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2.gz")
|
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2.gz")
|
||||||
.getPath()),
|
.getPath()),
|
||||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input2.gz"));
|
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input2.gz"));
|
||||||
|
|
||||||
fs
|
fs
|
||||||
.copyFromLocalFile(
|
.copyFromLocalFile(
|
||||||
false, new org.apache.hadoop.fs.Path(getClass()
|
false, new org.apache.hadoop.fs.Path(getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3.gz")
|
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3.gz")
|
||||||
.getPath()),
|
.getPath()),
|
||||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input3.gz"));
|
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input3.gz"));
|
||||||
|
|
||||||
fs
|
fs
|
||||||
.copyFromLocalFile(
|
.copyFromLocalFile(
|
||||||
false, new org.apache.hadoop.fs.Path(getClass()
|
false, new org.apache.hadoop.fs.Path(getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input4.gz")
|
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input4.gz")
|
||||||
.getPath()),
|
.getPath()),
|
||||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4.gz"));
|
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4.gz"));
|
||||||
|
|
||||||
fs
|
fs
|
||||||
.copyFromLocalFile(
|
.copyFromLocalFile(
|
||||||
false, new org.apache.hadoop.fs.Path(getClass()
|
false, new org.apache.hadoop.fs.Path(getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input5.gz")
|
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input5.gz")
|
||||||
.getPath()),
|
.getPath()),
|
||||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input5.gz"));
|
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input5.gz"));
|
||||||
|
|
||||||
ReadCOCI
|
ReadCOCI
|
||||||
.main(
|
.main(
|
||||||
new String[] {
|
new String[] {
|
||||||
"-isSparkSessionManaged",
|
"-isSparkSessionManaged",
|
||||||
Boolean.FALSE.toString(),
|
Boolean.FALSE.toString(),
|
||||||
"-workingPath",
|
"-workingPath",
|
||||||
workingDir.toString() + "/COCI",
|
workingDir.toString() + "/COCI",
|
||||||
"-outputPath",
|
"-outputPath",
|
||||||
workingDir.toString() + "/COCI_json/",
|
workingDir.toString() + "/COCI_json/",
|
||||||
"-inputFile", "input1;input2;input3;input4;input5"
|
"-inputFile", "input1;input2;input3;input4;input5"
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue