[dumpCSV] addressing the issues pointed out by the Dare Lab people. Repeated relations from author to result due to the author being repeated in the data. Repeated relations from result to result due to the same pid being present in more than one result. Author table not properly formatted due to the bad formatting of the input data.
This commit is contained in:
parent
9d1b708a89
commit
3bfac8bc6e
|
@ -161,7 +161,7 @@ public class SparkDumpResults implements Serializable {
|
|||
|
||||
ar.autosetId();
|
||||
|
||||
if(!authorIds.contains(ar.getAuthorId())){
|
||||
if (!authorIds.contains(ar.getAuthorId())) {
|
||||
arl.add(ar);
|
||||
authorIds.add(ar.getAuthorId());
|
||||
}
|
||||
|
@ -193,17 +193,18 @@ public class SparkDumpResults implements Serializable {
|
|||
.mode(SaveMode.Overwrite)
|
||||
.json(workingPath + "/" + resultType + "/result_author");
|
||||
|
||||
// ma the authors in the working dir. I do not want to have them repeated. If I have an orcid as id, I choose the one from orcid if any
|
||||
// ma the authors in the working dir. I do not want to have them repeated. If I have an orcid as id, I choose
|
||||
// the one from orcid if any
|
||||
authorResult
|
||||
.groupByKey((MapFunction<AuthorResult, String>) ar -> ar.getAuthorId(), Encoders.STRING())
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, AuthorResult, CSVAuthor>) (k, it) -> {
|
||||
AuthorResult first = it.next();
|
||||
if(!Optional.ofNullable(first.getFromOrcid()).isPresent() || first.getFromOrcid())
|
||||
if (!Optional.ofNullable(first.getFromOrcid()).isPresent() || first.getFromOrcid())
|
||||
return getAuthorDump(first);
|
||||
while(it.hasNext()){
|
||||
while (it.hasNext()) {
|
||||
AuthorResult ar = it.next();
|
||||
if(ar.getFromOrcid())
|
||||
if (ar.getFromOrcid())
|
||||
return getAuthorDump(ar);
|
||||
}
|
||||
return getAuthorDump(first);
|
||||
|
@ -216,7 +217,7 @@ public class SparkDumpResults implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
private static String replace(String input){
|
||||
private static String replace(String input) {
|
||||
if (Optional.ofNullable(input).isPresent())
|
||||
return input.replace("\t", " ").replace("\n", " ").replace("\r", " ").replace("\"", " ");
|
||||
else
|
||||
|
|
|
@ -10,7 +10,6 @@ import java.nio.file.Path;
|
|||
import java.util.HashMap;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
@ -35,6 +34,7 @@ import org.slf4j.LoggerFactory;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
|
@ -207,7 +207,7 @@ public class DumpResultTest {
|
|||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
Dataset<CSVAuthor> tmp = Utils
|
||||
.readPath(spark, workingDir.toString() + "/working/publication/author", CSVAuthor.class);
|
||||
.readPath(spark, workingDir.toString() + "/working/publication/author", CSVAuthor.class);
|
||||
|
||||
Assertions.assertEquals(13, tmp.count());
|
||||
|
||||
|
|
Loading…
Reference in New Issue