forked from D-Net/dnet-hadoop
wf doi_authors generates one JSON record for each row
This commit is contained in:
parent
fa1855a4b8
commit
5c65e602d3
|
@ -1,24 +1,25 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.orcid;
|
package eu.dnetlib.dhp.schema.orcid;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public class OrcidDOI {
|
public class OrcidDOI {
|
||||||
private String doi;
|
private String doi;
|
||||||
private List<AuthorData> authors;
|
private List<AuthorData> authors;
|
||||||
|
|
||||||
public String getDoi() {
|
public String getDoi() {
|
||||||
return doi;
|
return doi;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setDoi(String doi) {
|
public void setDoi(String doi) {
|
||||||
this.doi = doi;
|
this.doi = doi;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<AuthorData> getAuthors() {
|
public List<AuthorData> getAuthors() {
|
||||||
return authors;
|
return authors;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setAuthors(List<AuthorData> authors) {
|
public void setAuthors(List<AuthorData> authors) {
|
||||||
this.authors = authors;
|
this.authors = authors;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,37 +3,32 @@ package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.Function;
|
import org.apache.spark.api.java.function.Function;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.rdd.RDD;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.esotericsoftware.minlog.Log;
|
import com.esotericsoftware.minlog.Log;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import com.google.gson.JsonElement;
|
import com.google.gson.JsonElement;
|
||||||
import com.google.gson.JsonParser;
|
import com.google.gson.JsonParser;
|
||||||
import com.ximpleware.ParseException;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
|
||||||
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.OrcidDOI;
|
||||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||||
|
@ -154,11 +149,13 @@ public class SparkGenerateDoiAuthorList {
|
||||||
authorList.removeIf(a -> !oidsAlreadySeen.add(a.getOid()));
|
authorList.removeIf(a -> !oidsAlreadySeen.add(a.getOid()));
|
||||||
return new Tuple2<>(s._1(), authorList);
|
return new Tuple2<>(s._1(), authorList);
|
||||||
})
|
})
|
||||||
.mapToPair(
|
.map(s -> {
|
||||||
s -> {
|
OrcidDOI orcidDOI = new OrcidDOI();
|
||||||
return new Tuple2<>(s._1(), JsonWriter.create(s._2()));
|
orcidDOI.setDoi(s._1());
|
||||||
})
|
orcidDOI.setAuthors(s._2());
|
||||||
.saveAsTextFile(workingPath + outputDoiAuthorListPath);
|
return JsonWriter.create(orcidDOI);
|
||||||
|
})
|
||||||
|
.saveAsTextFile(workingPath + outputDoiAuthorListPath, GzipCodec.class);
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>spark2MaxExecutors</name>
|
<name>spark2MaxExecutors</name>
|
||||||
<value>40</value>
|
<value>20</value>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>oozieActionShareLibForSpark2</name>
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
|
|
Loading…
Reference in New Issue