hostedbymap
parent 3d2bba3d5d
commit baad01cadc
@@ -1,54 +0,0 @@
package eu.dnetlib.dhp.oa.graph.hostebymap

import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
import org.apache.spark.sql.expressions.Aggregator

case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
case class HostedByInfo(id: String, officialname: String, journal_id: String, provenance: String, id_type: String) {}

object Aggregators {

  def getId(s1: String, s2: String): String = {
    if (!s1.equals("")) {
      return s1
    }
    s2
  }

  def createHostedByItemTypes(df: Dataset[HostedByItemType]): Dataset[HostedByItemType] = {
    val transformedData: Dataset[HostedByItemType] = df
      .groupByKey(_.id)(Encoders.STRING)
      .agg(Aggregators.hostedByAggregator)
      .map {
        case (id: String, res: HostedByItemType) => res
      }(Encoders.product[HostedByItemType])

    transformedData
  }

  val hostedByAggregator: TypedColumn[HostedByItemType, HostedByItemType] = new Aggregator[HostedByItemType, HostedByItemType, HostedByItemType] {
    override def zero: HostedByItemType = HostedByItemType("", "", "", "", "", false)
    override def reduce(b: HostedByItemType, a: HostedByItemType): HostedByItemType = {
      return merge(b, a)
    }
    override def merge(b1: HostedByItemType, b2: HostedByItemType): HostedByItemType = {
      if (b1 == null) {
        return b2
      }
      if (b2 == null) {
        return b1
      }

      HostedByItemType(getId(b1.id, b2.id), getId(b1.officialname, b2.officialname), getId(b1.issn, b2.issn), getId(b1.eissn, b2.eissn), getId(b1.lissn, b2.lissn), b1.openAccess || b2.openAccess)
    }
    override def finish(reduction: HostedByItemType): HostedByItemType = reduction
    override def bufferEncoder: Encoder[HostedByItemType] = Encoders.product[HostedByItemType]
    override def outputEncoder: Encoder[HostedByItemType] = Encoders.product[HostedByItemType]
  }.toColumn

}
@@ -1,15 +0,0 @@
package eu.dnetlib.dhp.oa.graph.hostebymap;

public class Constants {

    public static final String OPENAIRE = "openaire";
    public static final String DOAJ = "doaj";
    public static final String UNIBI = "unibi";

    public static final String ISSN = "issn";
    public static final String EISSN = "eissn";
    public static final String ISSNL = "issnl";
}
@@ -1,111 +0,0 @@
package eu.dnetlib.dhp.oa.graph.hostebymap;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.bean.CsvToBeanBuilder;
import eu.dnetlib.dhp.oa.graph.hostebymap.model.UnibiGoldModel;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Optional;

public class GetCSV {

    private static final Log log = LogFactory.getLog(eu.dnetlib.dhp.oa.graph.hostebymap.GetCSV.class);

    public static void main(final String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    GetCSV.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/oa/graph/hostedbymap/download_csv_parameters.json")));

        parser.parseArgument(args);

        final String fileURL = parser.get("fileURL");
        final String hdfsPath = parser.get("hdfsPath");
        final String hdfsNameNode = parser.get("hdfsNameNode");
        final String classForName = parser.get("classForName");
        final Boolean shouldReplace = Optional.ofNullable((parser.get("replace")))
            .map(Boolean::valueOf)
            .orElse(false);

        URLConnection connection = new URL(fileURL).openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        connection.connect();

        BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8")));

        if (shouldReplace) {
            PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter("/tmp/DOAJ.csv")));
            String line = null;
            while ((line = in.readLine()) != null) {
                writer.println(line.replace("\\\"", "\""));
            }
            writer.close();
            in.close();
            in = new BufferedReader(new FileReader("/tmp/DOAJ.csv"));
        }

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", hdfsNameNode);

        FileSystem fileSystem = FileSystem.get(conf);
        Path hdfsWritePath = new Path(hdfsPath);
        FSDataOutputStream fsDataOutputStream = null;
        if (fileSystem.exists(hdfsWritePath)) {
            fileSystem.delete(hdfsWritePath, false);
        }
        fsDataOutputStream = fileSystem.create(hdfsWritePath);

        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));

        Class<?> clazz = Class.forName(classForName);

        ObjectMapper mapper = new ObjectMapper();

        new CsvToBeanBuilder(in)
            .withType(clazz)
            .withMultilineLimit(1)
            .build()
            .parse()
            .forEach(line -> {
                try {
                    writer.write(mapper.writeValueAsString(line));
                    writer.newLine();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            });

        writer.close();
        in.close();
        if (shouldReplace) {
            File f = new File("/tmp/DOAJ.csv");
            f.delete();
        }

    }

}
@@ -1,53 +0,0 @@
package eu.dnetlib.dhp.oa.graph.hostebymap.model;

import java.io.Serializable;

import com.opencsv.bean.CsvBindByName;

public class DOAJModel implements Serializable {

    @CsvBindByName(column = "Journal title")
    private String journalTitle;

    @CsvBindByName(column = "Journal ISSN (print version)")
    private String issn;

    @CsvBindByName(column = "Journal EISSN (online version)")
    private String eissn;

    @CsvBindByName(column = "Review process")
    private String reviewProcess;

    public String getJournalTitle() {
        return journalTitle;
    }

    public void setJournalTitle(String journalTitle) {
        this.journalTitle = journalTitle;
    }

    public String getIssn() {
        return issn;
    }

    public void setIssn(String issn) {
        this.issn = issn;
    }

    public String getEissn() {
        return eissn;
    }

    public void setEissn(String eissn) {
        this.eissn = eissn;
    }

    public String getReviewProcess() {
        return reviewProcess;
    }

    public void setReviewProcess(String reviewProcess) {
        this.reviewProcess = reviewProcess;
    }
}
@@ -1,44 +0,0 @@
package eu.dnetlib.dhp.oa.graph.hostebymap.model;

import com.opencsv.bean.CsvBindByName;

import java.io.Serializable;

public class UnibiGoldModel implements Serializable {

    @CsvBindByName(column = "ISSN")
    private String issn;
    @CsvBindByName(column = "ISSN_L")
    private String issn_l;
    @CsvBindByName(column = "TITLE")
    private String title;
    @CsvBindByName(column = "TITLE_SOURCE")
    private String title_source;

    public String getIssn() {
        return issn;
    }

    public void setIssn(String issn) {
        this.issn = issn;
    }

    public String getIssn_l() {
        return issn_l;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle_source() {
        return title_source;
    }

    public void setTitle_source(String title_source) {
        this.title_source = title_source;
    }
}
@@ -0,0 +1,97 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap

import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
import org.apache.spark.sql.expressions.Aggregator

case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
case class HostedByInfo(id: String, officialname: String, journal_id: String, provenance: String, id_type: String) {}

object Aggregators {

  def getId(s1: String, s2: String): String = {
    if (s1.startsWith("10|")) {
      return s1
    }
    s2
  }

  def getValue(s1: String, s2: String): String = {
    if (!s1.equals("")) {
      return s1
    }
    s2
  }

  def createHostedByItemTypes(df: Dataset[HostedByItemType]): Dataset[HostedByItemType] = {
    val transformedData: Dataset[HostedByItemType] = df
      .groupByKey(_.id)(Encoders.STRING)
      .agg(Aggregators.hostedByAggregator)
      .map {
        case (id: String, res: HostedByItemType) => res
      }(Encoders.product[HostedByItemType])

    transformedData
  }

  val hostedByAggregator: TypedColumn[HostedByItemType, HostedByItemType] = new Aggregator[HostedByItemType, HostedByItemType, HostedByItemType] {
    override def zero: HostedByItemType = HostedByItemType("", "", "", "", "", false)
    override def reduce(b: HostedByItemType, a: HostedByItemType): HostedByItemType = {
      return merge(b, a)
    }
    override def merge(b1: HostedByItemType, b2: HostedByItemType): HostedByItemType = {
      if (b1 == null) {
        return b2
      }
      if (b2 == null) {
        return b1
      }

      HostedByItemType(getId(b1.id, b2.id), getId(b1.officialname, b2.officialname), getId(b1.issn, b2.issn), getId(b1.eissn, b2.eissn), getId(b1.lissn, b2.lissn), b1.openAccess || b2.openAccess)
    }
    override def finish(reduction: HostedByItemType): HostedByItemType = reduction
    override def bufferEncoder: Encoder[HostedByItemType] = Encoders.product[HostedByItemType]
    override def outputEncoder: Encoder[HostedByItemType] = Encoders.product[HostedByItemType]
  }.toColumn

  def explodeHostedByItemType(df: Dataset[(String, HostedByItemType)]): Dataset[(String, HostedByItemType)] = {
    val transformedData: Dataset[(String, HostedByItemType)] = df
      .groupByKey(_._1)(Encoders.STRING)
      .agg(Aggregators.hostedByAggregator1)
      .map {
        case (id: String, res: (String, HostedByItemType)) => res
      }(Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType]))

    transformedData
  }

  val hostedByAggregator1: TypedColumn[(String, HostedByItemType), (String, HostedByItemType)] = new Aggregator[(String, HostedByItemType), (String, HostedByItemType), (String, HostedByItemType)] {
    override def zero: (String, HostedByItemType) = ("", HostedByItemType("", "", "", "", "", false))
    override def reduce(b: (String, HostedByItemType), a: (String, HostedByItemType)): (String, HostedByItemType) = {
      return merge(b, a)
    }
    override def merge(b1: (String, HostedByItemType), b2: (String, HostedByItemType)): (String, HostedByItemType) = {
      if (b1 == null) {
        return b2
      }
      if (b2 == null) {
        return b1
      }
      if (b1._2.id.startsWith("10|")) {
        return (b1._1, HostedByItemType(b1._2.id, b1._2.officialname, b1._2.issn, b1._2.eissn, b1._2.lissn, b1._2.openAccess || b2._2.openAccess))
      }
      return (b2._1, HostedByItemType(b2._2.id, b2._2.officialname, b2._2.issn, b2._2.eissn, b2._2.lissn, b1._2.openAccess || b2._2.openAccess))
    }
    override def finish(reduction: (String, HostedByItemType)): (String, HostedByItemType) = reduction
    override def bufferEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
    override def outputEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
  }.toColumn

}
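For orientation only (not part of the commit): a minimal sketch of how the new Aggregators.createHostedByItemTypes can be exercised against a local Spark session. The object name, the sample datasource id and the ISSN values are invented; in the workflow below the aggregation is driven by SparkProduceHostedByMap on YARN.

package eu.dnetlib.dhp.oa.graph.hostedbymap

import org.apache.spark.sql.SparkSession

object AggregatorsSketch {
  def main(args: Array[String]): Unit = {
    // local session for illustration only; the production job runs on a YARN cluster
    val spark = SparkSession.builder().master("local[*]").appName("hostedbymap-sketch").getOrCreate()
    import spark.implicits._

    // two rows sharing the same (hypothetical) datasource id; the openAccess flags are OR-ed during the merge
    val items = Seq(
      HostedByItemType("10|issn___print::0001", "Some Journal", "1234-5678", "", "", false),
      HostedByItemType("10|issn___print::0001", "Some Journal", "", "8765-4321", "", true)
    ).toDS()

    // groupByKey on the id plus the typed aggregator leaves one merged HostedByItemType per datasource id
    Aggregators.createHostedByItemTypes(items).show(false)

    spark.stop()
  }
}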
@@ -0,0 +1,13 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap;

public class Constants {

    public static final String OPENAIRE = "openaire";
    public static final String DOAJ = "doaj";
    public static final String UNIBI = "unibi";

    public static final String ISSN = "issn";
    public static final String EISSN = "eissn";
    public static final String ISSNL = "issnl";
}
@@ -0,0 +1,107 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.bean.CsvToBeanBuilder;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class GetCSV {

    private static final Log log = LogFactory.getLog(eu.dnetlib.dhp.oa.graph.hostedbymap.GetCSV.class);

    public static void main(final String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    GetCSV.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/oa/graph/hostedbymap/download_csv_parameters.json")));

        parser.parseArgument(args);

        final String fileURL = parser.get("fileURL");
        final String hdfsPath = parser.get("workingPath");
        final String hdfsNameNode = parser.get("hdfsNameNode");
        final String classForName = parser.get("classForName");
        final Boolean shouldReplace = Optional
            .ofNullable((parser.get("replace")))
            .map(Boolean::valueOf)
            .orElse(false);

        URLConnection connection = new URL(fileURL).openConnection();
        connection
            .setRequestProperty(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        connection.connect();

        BufferedReader in = new BufferedReader(
            new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8")));

        if (shouldReplace) {
            PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter("/tmp/DOAJ.csv")));
            String line = null;
            while ((line = in.readLine()) != null) {
                writer.println(line.replace("\\\"", "\""));
            }
            writer.close();
            in.close();
            in = new BufferedReader(new FileReader("/tmp/DOAJ.csv"));
        }

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", hdfsNameNode);

        FileSystem fileSystem = FileSystem.get(conf);
        Path hdfsWritePath = new Path(hdfsPath);
        FSDataOutputStream fsDataOutputStream = null;
        if (fileSystem.exists(hdfsWritePath)) {
            fileSystem.delete(hdfsWritePath, false);
        }
        fsDataOutputStream = fileSystem.create(hdfsWritePath);

        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));

        Class<?> clazz = Class.forName(classForName);

        ObjectMapper mapper = new ObjectMapper();

        new CsvToBeanBuilder(in)
            .withType(clazz)
            .withMultilineLimit(1)
            .build()
            .parse()
            .forEach(line -> {
                try {
                    writer.write(mapper.writeValueAsString(line));
                    writer.newLine();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            });

        writer.close();
        in.close();
        if (shouldReplace) {
            File f = new File("/tmp/DOAJ.csv");
            f.delete();
        }

    }

}
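As a side note (not part of the commit), the CSV-to-JSON-lines conversion performed by GetCSV above can be reproduced in isolation. A small sketch, assuming the same opencsv and Jackson dependencies; the object name and the inline UNIBI-style row are invented, and stdout stands in for the HDFS output stream:

package eu.dnetlib.dhp.oa.graph.hostedbymap

import java.io.StringReader

import scala.collection.JavaConverters._

import com.fasterxml.jackson.databind.ObjectMapper
import com.opencsv.bean.CsvToBeanBuilder

import eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel

object CsvToJsonLinesSketch {
  def main(args: Array[String]): Unit = {
    // inline CSV carrying the UNIBI gold-list header; the data row is made up
    val csv =
      "\"ISSN\",\"ISSN_L\",\"TITLE\",\"TITLE_SOURCE\"\n" +
        "\"0001-3455\",\"0001-3455\",\"Some Journal\",\"ROAD\"\n"

    val mapper = new ObjectMapper()

    // same pipeline as GetCSV: opencsv binds each row to the @CsvBindByName-annotated bean,
    // Jackson turns every bean into one JSON line (GetCSV writes these lines to HDFS instead)
    new CsvToBeanBuilder[UnibiGoldModel](new StringReader(csv))
      .withType(classOf[UnibiGoldModel])
      .build()
      .parse()
      .asScala
      .foreach(bean => println(mapper.writeValueAsString(bean)))
  }
}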
@@ -0,0 +1,52 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap.model;

import java.io.Serializable;

import com.opencsv.bean.CsvBindByName;

public class DOAJModel implements Serializable {

    @CsvBindByName(column = "Journal title")
    private String journalTitle;

    @CsvBindByName(column = "Journal ISSN (print version)")
    private String issn;

    @CsvBindByName(column = "Journal EISSN (online version)")
    private String eissn;

    @CsvBindByName(column = "Review process")
    private String reviewProcess;

    public String getJournalTitle() {
        return journalTitle;
    }

    public void setJournalTitle(String journalTitle) {
        this.journalTitle = journalTitle;
    }

    public String getIssn() {
        return issn;
    }

    public void setIssn(String issn) {
        this.issn = issn;
    }

    public String getEissn() {
        return eissn;
    }

    public void setEissn(String eissn) {
        this.eissn = eissn;
    }

    public String getReviewProcess() {
        return reviewProcess;
    }

    public void setReviewProcess(String reviewProcess) {
        this.reviewProcess = reviewProcess;
    }
}
@@ -0,0 +1,45 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap.model;

import java.io.Serializable;

import com.opencsv.bean.CsvBindByName;

public class UnibiGoldModel implements Serializable {

    @CsvBindByName(column = "ISSN")
    private String issn;
    @CsvBindByName(column = "ISSN_L")
    private String issn_l;
    @CsvBindByName(column = "TITLE")
    private String title;
    @CsvBindByName(column = "TITLE_SOURCE")
    private String title_source;

    public String getIssn() {
        return issn;
    }

    public void setIssn(String issn) {
        this.issn = issn;
    }

    public String getIssn_l() {
        return issn_l;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getTitle_source() {
        return title_source;
    }

    public void setTitle_source(String title_source) {
        this.title_source = title_source;
    }
}
@@ -0,0 +1,37 @@
[
  {
    "paramName": "fu",
    "paramLongName": "fileURL",
    "paramDescription": "the url to download the csv file",
    "paramRequired": true
  },
  {
    "paramName": "wp",
    "paramLongName": "workingPath",
    "paramDescription": "the path where to store the downloaded unibi gold list and doaj articles",
    "paramRequired": true
  },
  {
    "paramName": "hnn",
    "paramLongName": "hdfsNameNode",
    "paramDescription": "the HDFS name node",
    "paramRequired": true
  },
  {
    "paramName": "cfn",
    "paramLongName": "classForName",
    "paramDescription": "the fully qualified name of the model class used to parse the csv",
    "paramRequired": true
  },
  {
    "paramName": "sr",
    "paramLongName": "replace",
    "paramDescription": "true if the input file has to be cleaned before parsing",
    "paramRequired": false
  }
]
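The parameter definitions above describe the command line of GetCSV. As a hypothetical example (the name node URI and the paths are placeholders, and the wrapper object is invented), the downloader could be invoked directly with an argument vector such as:

package eu.dnetlib.dhp.oa.graph.hostedbymap

object DownloadCsvSketch {
  def main(args: Array[String]): Unit = {
    // argument names follow download_csv_parameters.json; URI and paths are placeholders
    GetCSV.main(Array(
      "--hdfsNameNode", "hdfs://localhost:9000",
      "--fileURL", "https://doaj.org/csv",
      "--workingPath", "/user/hostedbymap/doaj",
      "--classForName", "eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel",
      "--replace", "true"
    ))
  }
}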
@@ -0,0 +1,38 @@
[
  {
    "paramName": "dsp",
    "paramLongName": "datasourcePath",
    "paramDescription": "the path to the datasource",
    "paramRequired": true
  },
  {
    "paramName": "wp",
    "paramLongName": "workingPath",
    "paramDescription": "the path where to find the pre-processed data for the unibi gold list and doaj articles",
    "paramRequired": true
  },
  {
    "paramName": "out",
    "paramLongName": "outputPath",
    "paramDescription": "the path used to store the HostedByMap",
    "paramRequired": true
  },
  {
    "paramName": "ssm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "true if the spark session is managed, false otherwise",
    "paramRequired": false
  },
  {
    "paramName": "m",
    "paramLongName": "master",
    "paramDescription": "the spark master",
    "paramRequired": true
  }
]
@@ -0,0 +1,30 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>hiveMetastoreUris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>hiveJdbcUrl</name>
        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
    </property>
    <property>
        <name>hiveDbName</name>
        <value>openaire</value>
    </property>
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
</configuration>
@@ -0,0 +1,148 @@
<workflow-app name="hosted_by_map" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path</description>
        </property>
        <property>
            <name>outputPath</name>
            <description>the output path</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>

    <start to="reset_outputpath"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="reset_outputpath">
        <fs>
            <delete path="${outputPath}"/>
            <mkdir path="${outputPath}"/>
        </fs>
        <ok to="fork_downloads_csv"/>
        <error to="Kill"/>
    </action>

    <fork name="fork_downloads_csv">
        <path start="download_gold"/>
        <path start="download_doaj"/>
    </fork>

    <action name="download_gold">
        <java>
            <main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.GetCSV</main-class>
            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
            <arg>--fileURL</arg><arg>${unibiFileURL}</arg>
            <arg>--workingPath</arg><arg>${workingDir}/unibi_gold</arg>
            <arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel</arg>
        </java>
        <ok to="join_download"/>
        <error to="Kill"/>
    </action>

    <action name="download_doaj">
        <java>
            <main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.GetCSV</main-class>
            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
            <arg>--fileURL</arg><arg>${doajFileURL}</arg>
            <arg>--workingPath</arg><arg>${workingDir}/doaj</arg>
            <arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel</arg>
            <arg>--replace</arg><arg>true</arg>
        </java>
        <ok to="join_download"/>
        <error to="Kill"/>
    </action>

    <join name="join_download" to="produceHBM"/>

    <action name="produceHBM">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <name>Produce the new HostedByMap</name>
            <class>eu.dnetlib.dhp.oa.graph.hostedbymap.SparkProduceHostedByMap</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
            </spark-opts>
            <arg>--datasourcePath</arg><arg>${sourcePath}/datasource</arg>
            <arg>--workingPath</arg><arg>${workingDir}</arg>
            <arg>--outputPath</arg><arg>${outputPath}</arg>
            <arg>--master</arg><arg>yarn-cluster</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>

</workflow-app>
@@ -1,111 +1,109 @@
-package eu.dnetlib.dhp.oa.graph.hostedbymap;
-
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.opencsv.bean.CsvToBeanBuilder;
-import eu.dnetlib.dhp.oa.graph.hostebymap.GetCSV;
-import eu.dnetlib.dhp.oa.graph.hostebymap.model.UnibiGoldModel;
-import org.junit.jupiter.api.Test;
-
-import java.io.*;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.net.URLConnection;
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-import java.util.List;
-
-public class TestReadCSV {
-
-    @Test
-    public void testCSVUnibi() throws FileNotFoundException {
-
-        final String sourcePath = getClass()
-            .getResource("/eu/dnetlib/dhp/oa/graph/hostedbymap/unibiGold.csv")
-            .getPath();
-
-        List<UnibiGoldModel> beans = new CsvToBeanBuilder(new FileReader(sourcePath))
-            .withType(UnibiGoldModel.class)
-            .build()
-            .parse();
-
-        ObjectMapper mapper = new ObjectMapper();
-
-        beans.forEach(r -> {
-            try {
-                System.out.println(mapper.writeValueAsString(r));
-            } catch (JsonProcessingException e) {
-                e.printStackTrace();
-            }
-        });
-
-    }
-
-    @Test
-    public void testCSVUrlUnibi() throws IOException {
-
-        URL csv = new URL("https://pub.uni-bielefeld.de/download/2944717/2944718/issn_gold_oa_version_4.csv");
-
-        BufferedReader in = new BufferedReader(new InputStreamReader(csv.openStream()));
-        ObjectMapper mapper = new ObjectMapper();
-
-        new CsvToBeanBuilder(in)
-            .withType(eu.dnetlib.dhp.oa.graph.hostebymap.model.UnibiGoldModel.class)
-            .build()
-            .parse()
-            .forEach(line -> {
-                try {
-                    System.out.println(mapper.writeValueAsString(line));
-                } catch (JsonProcessingException e) {
-                    e.printStackTrace();
-                }
-            });
-    }
-
-    @Test
-    public void testCSVUrlDOAJ() throws IOException {
-
-        URLConnection connection = new URL("https://doaj.org/csv").openConnection();
-        connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
-        connection.connect();
-
-        BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8")));
-        //BufferedReader in = new BufferedReader(new FileReader("/tmp/DOAJ.csv"));
-        PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter("/tmp/DOAJ_1.csv")));
-        String line = null;
-        while ((line = in.readLine()) != null) {
-            writer.println(line.replace("\\\"", "\""));
-        }
-        writer.close();
-        in.close();
-        in = new BufferedReader(new FileReader("/tmp/DOAJ_1.csv"));
-        ObjectMapper mapper = new ObjectMapper();
-
-        new CsvToBeanBuilder(in)
-            .withType(eu.dnetlib.dhp.oa.graph.hostebymap.model.DOAJModel.class)
-            .withMultilineLimit(1)
-            .build()
-            .parse()
-            .forEach(lline -> {
-                try {
-                    System.out.println(mapper.writeValueAsString(lline));
-                } catch (JsonProcessingException e) {
-                    e.printStackTrace();
-                }
-            });
-    }
-}
+package eu.dnetlib.dhp.oa.graph.hostedbymap;
+
+import java.io.*;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.opencsv.bean.CsvToBeanBuilder;
+
+import eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel;
+
+public class TestReadCSV {
+
+    @Test
+    public void testCSVUnibi() throws FileNotFoundException {
+
+        final String sourcePath = getClass()
+            .getResource("/eu/dnetlib/dhp/oa/graph/hostedbymap/unibiGold.csv")
+            .getPath();
+
+        List<UnibiGoldModel> beans = new CsvToBeanBuilder(new FileReader(sourcePath))
+            .withType(UnibiGoldModel.class)
+            .build()
+            .parse();
+
+        ObjectMapper mapper = new ObjectMapper();
+
+        beans.forEach(r -> {
+            try {
+                System.out.println(mapper.writeValueAsString(r));
+            } catch (JsonProcessingException e) {
+                e.printStackTrace();
+            }
+        });
+
+    }
+
+    @Test
+    public void testCSVUrlUnibi() throws IOException {
+
+        URL csv = new URL("https://pub.uni-bielefeld.de/download/2944717/2944718/issn_gold_oa_version_4.csv");
+
+        BufferedReader in = new BufferedReader(new InputStreamReader(csv.openStream()));
+        ObjectMapper mapper = new ObjectMapper();
+
+        new CsvToBeanBuilder(in)
+            .withType(eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel.class)
+            .build()
+            .parse()
+            .forEach(line -> {
+                try {
+                    System.out.println(mapper.writeValueAsString(line));
+                } catch (JsonProcessingException e) {
+                    e.printStackTrace();
+                }
+            });
+    }
+
+    @Test
+    public void testCSVUrlDOAJ() throws IOException {
+
+        URLConnection connection = new URL("https://doaj.org/csv").openConnection();
+        connection
+            .setRequestProperty(
+                "User-Agent",
+                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
+        connection.connect();
+
+        BufferedReader in = new BufferedReader(
+            new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8")));
+        // BufferedReader in = new BufferedReader(new FileReader("/tmp/DOAJ.csv"));
+        PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter("/tmp/DOAJ_1.csv")));
+        String line = null;
+        while ((line = in.readLine()) != null) {
+            writer.println(line.replace("\\\"", "\""));
+        }
+        writer.close();
+        in.close();
+        in = new BufferedReader(new FileReader("/tmp/DOAJ_1.csv"));
+        ObjectMapper mapper = new ObjectMapper();
+
+        new CsvToBeanBuilder(in)
+            .withType(eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel.class)
+            .withMultilineLimit(1)
+            .build()
+            .parse()
+            .forEach(lline -> {
+                try {
+                    System.out.println(mapper.writeValueAsString(lline));
+                } catch (JsonProcessingException e) {
+                    e.printStackTrace();
+                }
+            });
+    }
+}
File diff suppressed because one or more lines are too long