[dumpCSV] addressing the issues pointed out by the Dare Lab people: repeated relations from author to result, due to the same author being repeated in the data; repeated relations from result to result, due to the same pid being present in more than one result; author table not properly formatted, due to the bad formatting of the input data.

Miriam Baglioni 2023-07-07 18:01:26 +02:00
parent 9d1b708a89
commit 3bfac8bc6e
2 changed files with 9 additions and 8 deletions

SparkDumpResults.java

@@ -161,7 +161,7 @@ public class SparkDumpResults implements Serializable {
 ar.autosetId();
-if(!authorIds.contains(ar.getAuthorId())){
+if (!authorIds.contains(ar.getAuthorId())) {
 arl.add(ar);
 authorIds.add(ar.getAuthorId());
 }
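
For context, a minimal standalone sketch of the dedup pattern this hunk sits in: a relation is emitted only the first time an author id is seen, so an author repeated in the input no longer produces repeated author-to-result relations. The surrounding loop is not part of the diff; AuthorRel and everything below are illustrative assumptions, not the actual dumpCSV types.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DedupRelationsSketch {

    // Hypothetical stand-in for the author/result pairs built by SparkDumpResults
    record AuthorRel(String authorId, String resultId) {
    }

    // Keep only the first relation seen for each author id
    static List<AuthorRel> dedup(List<AuthorRel> input) {
        Set<String> authorIds = new HashSet<>();
        List<AuthorRel> arl = new ArrayList<>();
        for (AuthorRel ar : input) {
            if (!authorIds.contains(ar.authorId())) {
                arl.add(ar);
                authorIds.add(ar.authorId());
            }
        }
        return arl;
    }

    public static void main(String[] args) {
        List<AuthorRel> in = List.of(
            new AuthorRel("a1", "r1"),
            new AuthorRel("a1", "r1"), // repeated author: dropped
            new AuthorRel("a2", "r1"));
        System.out.println(dedup(in).size()); // prints 2
    }
}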
@@ -193,17 +193,18 @@ public class SparkDumpResults implements Serializable {
 .mode(SaveMode.Overwrite)
 .json(workingPath + "/" + resultType + "/result_author");
-// ma the authors in the working dir. I do not want to have them repeated. If I have an orcid as id, I choose the one from orcid if any
+// ma the authors in the working dir. I do not want to have them repeated. If I have an orcid as id, I choose
+// the one from orcid if any
 authorResult
 .groupByKey((MapFunction<AuthorResult, String>) ar -> ar.getAuthorId(), Encoders.STRING())
 .mapGroups(
 (MapGroupsFunction<String, AuthorResult, CSVAuthor>) (k, it) -> {
 AuthorResult first = it.next();
-if(!Optional.ofNullable(first.getFromOrcid()).isPresent() || first.getFromOrcid())
+if (!Optional.ofNullable(first.getFromOrcid()).isPresent() || first.getFromOrcid())
 return getAuthorDump(first);
-while(it.hasNext()){
+while (it.hasNext()) {
 AuthorResult ar = it.next();
-if(ar.getFromOrcid())
+if (ar.getFromOrcid())
 return getAuthorDump(ar);
 }
 return getAuthorDump(first);
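
A Spark-free sketch of the group reduction this hunk reformats: among all records grouped under one author id, keep the ORCID-sourced one if present, otherwise the first. The Author record below is an illustrative assumption; the real code consumes AuthorResult groups inside mapGroups.

import java.util.Iterator;
import java.util.List;

public class PreferOrcidSketch {

    // Hypothetical stand-in for AuthorResult; fromOrcid may be unknown (null)
    record Author(String id, String name, Boolean fromOrcid) {
    }

    // Mirrors the mapGroups body: keep the first record when its provenance is
    // unknown or already ORCID; otherwise scan the rest of the group for an
    // ORCID-sourced record, falling back to the first when none exists
    static Author pickOne(Iterator<Author> it) {
        Author first = it.next();
        if (first.fromOrcid() == null || first.fromOrcid())
            return first;
        while (it.hasNext()) {
            Author a = it.next();
            if (Boolean.TRUE.equals(a.fromOrcid()))
                return a;
        }
        return first;
    }

    public static void main(String[] args) {
        List<Author> group = List.of(
            new Author("a1", "M. Rossi", false),
            new Author("a1", "Miriam Rossi", true));
        System.out.println(pickOne(group.iterator()).name()); // prints Miriam Rossi
    }
}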
@@ -216,7 +217,7 @@ public class SparkDumpResults implements Serializable {
 }
-private static String replace(String input){
+private static String replace(String input) {
 if (Optional.ofNullable(input).isPresent())
 return input.replace("\t", " ").replace("\n", " ").replace("\r", " ").replace("\"", " ");
 else
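
The hunk is cut off before the else branch, so the null fallback below is an assumption (an empty string keeps the delimited row well formed). Otherwise this is the same sanitisation the diff shows: characters that would break a delimited dump are replaced with spaces.

import java.util.Optional;

public class CsvSanitizerSketch {

    // Same sanitisation as replace(...) in the diff; the empty-string fallback
    // for null input is an assumption, since the hunk ends before the else body
    static String replace(String input) {
        if (Optional.ofNullable(input).isPresent())
            return input.replace("\t", " ").replace("\n", " ").replace("\r", " ").replace("\"", " ");
        else
            return "";
    }

    public static void main(String[] args) {
        // each stripped character becomes a single space
        System.out.println(replace("one\ttwo\n\"three\"")); // prints: one two  three
    }
}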

DumpResultTest.java

@@ -10,7 +10,6 @@ import java.nio.file.Path;
 import java.util.HashMap;
 import java.util.Optional;
-import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor;
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
@@ -35,6 +34,7 @@ import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.oa.graph.dump.Utils;
+import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor;
 import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Publication;
@@ -207,7 +207,7 @@ public class DumpResultTest {
 final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
 Dataset<CSVAuthor> tmp = Utils
-.readPath(spark, workingDir.toString() + "/working/publication/author", CSVAuthor.class);
+.readPath(spark, workingDir.toString() + "/working/publication/author", CSVAuthor.class);
 Assertions.assertEquals(13, tmp.count());