forked from D-Net/dnet-hadoop
Merge pull request 'HostedByMap' (#201) from hostedByMap_update into beta
Reviewed-on: D-Net/dnet-hadoop#201
This commit is contained in:
commit
9f3036c847
|
@ -0,0 +1,40 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.collection;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
public class DecompressTarGz {
|
||||
|
||||
public static void doExtract(FileSystem fs, String outputPath, String tarGzPath) throws IOException {
|
||||
|
||||
FSDataInputStream inputFileStream = fs.open(new Path(tarGzPath));
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(
|
||||
new GzipCompressorInputStream(inputFileStream))) {
|
||||
TarArchiveEntry entry = null;
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
if (!entry.isDirectory()) {
|
||||
try (
|
||||
FSDataOutputStream out = fs
|
||||
.create(new Path(outputPath.concat(entry.getName()).concat(".gz")));
|
||||
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
|
||||
|
||||
IOUtils.copy(tais, gzipOs);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,19 +1,13 @@
|
|||
|
||||
package eu.dnetlib.doiboost.crossref;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.net.URI;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
import static eu.dnetlib.dhp.common.collection.DecompressTarGz.doExtract;
|
||||
|
||||
import java.net.URI;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
@ -33,31 +27,16 @@ public class ExtractCrossrefRecords {
|
|||
final String outputPath = parser.get("outputPath");
|
||||
final String crossrefFileNameTarGz = parser.get("crossrefFileNameTarGz");
|
||||
|
||||
Path hdfsreadpath = new Path(workingPath.concat("/").concat(crossrefFileNameTarGz));
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", workingPath);
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
FileSystem fs = FileSystem.get(URI.create(workingPath), conf);
|
||||
FSDataInputStream crossrefFileStream = fs.open(hdfsreadpath);
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(
|
||||
new GzipCompressorInputStream(crossrefFileStream))) {
|
||||
TarArchiveEntry entry = null;
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
if (!entry.isDirectory()) {
|
||||
try (
|
||||
FSDataOutputStream out = fs
|
||||
.create(new Path(outputPath.concat(entry.getName()).concat(".gz")));
|
||||
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
|
||||
|
||||
IOUtils.copy(tais, gzipOs);
|
||||
doExtract(fs, outputPath, workingPath.concat("/").concat(crossrefFileNameTarGz));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
Log.info("Crossref dump reading completed");
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -23,7 +23,7 @@ public class DownloadCSV {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(DownloadCSV.class);
|
||||
|
||||
public static final char DEFAULT_DELIMITER = ';';
|
||||
public static final char DEFAULT_DELIMITER = ',';
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
|
@ -40,9 +40,6 @@ public class DownloadCSV {
|
|||
final String fileURL = parser.get("fileURL");
|
||||
log.info("fileURL {}", fileURL);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath {}", workingPath);
|
||||
|
||||
final String outputFile = parser.get("outputFile");
|
||||
log.info("outputFile {}", outputFile);
|
||||
|
||||
|
@ -63,31 +60,15 @@ public class DownloadCSV {
|
|||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
|
||||
new DownloadCSV().doDownload(fileURL, workingPath, outputFile, classForName, delimiter, fileSystem);
|
||||
new DownloadCSV().doDownload(fileURL, outputFile, classForName, delimiter, fileSystem);
|
||||
|
||||
}
|
||||
|
||||
protected void doDownload(String fileURL, String workingPath, String outputFile, String classForName,
|
||||
protected void doDownload(String fileURL, String outputFile, String classForName,
|
||||
char delimiter, FileSystem fs)
|
||||
throws IOException, ClassNotFoundException, CollectorException {
|
||||
|
||||
final HttpConnector2 connector2 = new HttpConnector2();
|
||||
|
||||
final Path path = new Path(workingPath + "/replaced.csv");
|
||||
|
||||
try (BufferedReader in = new BufferedReader(
|
||||
new InputStreamReader(connector2.getInputSourceAsStream(fileURL)))) {
|
||||
|
||||
try (PrintWriter writer = new PrintWriter(
|
||||
new OutputStreamWriter(fs.create(path, true), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = in.readLine()) != null) {
|
||||
writer.println(line.replace("\\\"", "\""));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try (InputStreamReader reader = new InputStreamReader(fs.open(path))) {
|
||||
try (InputStreamReader reader = new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL))) {
|
||||
GetCSV.getCsv(fs, reader, outputFile, classForName, delimiter);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,84 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.GetCSV;
|
||||
import eu.dnetlib.dhp.common.collection.HttpConnector2;
|
||||
|
||||
public class DownloadCSV2 {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DownloadCSV2.class);
|
||||
|
||||
public static final char DEFAULT_DELIMITER = ';';
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
DownloadCSV2.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/hostedbymap/download_csv_parameters.json"))));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String fileURL = parser.get("fileURL");
|
||||
log.info("fileURL {}", fileURL);
|
||||
|
||||
final String tmpFile = parser.get("tmpFile");
|
||||
log.info("tmpFile {}", tmpFile);
|
||||
|
||||
final String outputFile = parser.get("outputFile");
|
||||
log.info("outputFile {}", outputFile);
|
||||
|
||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||
log.info("hdfsNameNode {}", hdfsNameNode);
|
||||
|
||||
final String classForName = parser.get("classForName");
|
||||
log.info("classForName {}", classForName);
|
||||
|
||||
final char delimiter = Optional
|
||||
.ofNullable(parser.get("delimiter"))
|
||||
.map(s -> s.charAt(0))
|
||||
.orElse(DEFAULT_DELIMITER);
|
||||
log.info("delimiter {}", delimiter);
|
||||
|
||||
HttpConnector2 connector2 = new HttpConnector2();
|
||||
|
||||
try (BufferedReader in = new BufferedReader(
|
||||
new InputStreamReader(connector2.getInputSourceAsStream(fileURL)))) {
|
||||
|
||||
try (PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter(tmpFile)))) {
|
||||
String line;
|
||||
while ((line = in.readLine()) != null) {
|
||||
writer.println(line.replace("\\\"", "\""));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try (BufferedReader in = new BufferedReader(new FileReader(tmpFile))) {
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
|
||||
GetCSV.getCsv(fileSystem, in, outputFile, classForName, delimiter);
|
||||
} finally {
|
||||
FileUtils.deleteQuietly(new File(tmpFile));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap;
|
||||
|
||||
import static eu.dnetlib.dhp.common.collection.DecompressTarGz.doExtract;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.*;
|
||||
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.apache.hadoop.io.compress.CompressionInputStream;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel;
|
||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj.DOAJEntry;
|
||||
|
||||
public class ExtractAndMapDoajJson {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ExtractAndMapDoajJson.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
ExtractAndMapDoajJson.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/hostedbymap/download_json_parameters.json"))));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String compressedInput = parser.get("compressedFile");
|
||||
log.info("compressedInput {}", compressedInput);
|
||||
|
||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||
log.info("hdfsNameNode {}", hdfsNameNode);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath {}", workingPath);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fs = FileSystem.get(conf);
|
||||
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
||||
CompressionCodec codec = factory.getCodecByClassName("org.apache.hadoop.io.compress.GzipCodec");
|
||||
doExtract(fs, workingPath, compressedInput);
|
||||
doMap(fs, workingPath, outputPath, codec);
|
||||
|
||||
}
|
||||
|
||||
private static void doMap(FileSystem fs, String workingPath, String outputPath, CompressionCodec codec)
|
||||
throws IOException {
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fs
|
||||
.listFiles(
|
||||
new Path(workingPath), true);
|
||||
|
||||
Path hdfsWritePath = new Path(outputPath);
|
||||
if (fs.exists(hdfsWritePath)) {
|
||||
fs.delete(hdfsWritePath, true);
|
||||
|
||||
}
|
||||
try (
|
||||
|
||||
FSDataOutputStream out = fs
|
||||
.create(hdfsWritePath);
|
||||
PrintWriter writer = new PrintWriter(new BufferedOutputStream(out))) {
|
||||
|
||||
while (fileStatusListIterator.hasNext()) {
|
||||
Path path = fileStatusListIterator.next().getPath();
|
||||
if (!fs.isDirectory(path)) {
|
||||
FSDataInputStream is = fs.open(path);
|
||||
CompressionInputStream compressionInputStream = codec.createInputStream(is);
|
||||
DOAJEntry[] doajEntries = new ObjectMapper().readValue(compressionInputStream, DOAJEntry[].class);
|
||||
Arrays.stream(doajEntries).forEach(doaj -> {
|
||||
try {
|
||||
writer.println(new ObjectMapper().writeValueAsString(getDoajModel(doaj)));
|
||||
} catch (JsonProcessingException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@NotNull
|
||||
public static DOAJModel getDoajModel(DOAJEntry doaj) {
|
||||
DOAJModel doajModel = new DOAJModel();
|
||||
doajModel.setOaStart(doaj.getBibjson().getOa_start());
|
||||
doajModel.setEissn(doaj.getBibjson().getEissn());
|
||||
doajModel.setIssn(doaj.getBibjson().getPissn());
|
||||
doajModel.setJournalTitle(doaj.getBibjson().getTitle());
|
||||
doajModel.setReviewProcess(doaj.getBibjson().getEditorial().getReview_process());
|
||||
return doajModel;
|
||||
}
|
||||
|
||||
}
|
|
@ -2,6 +2,7 @@
|
|||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import com.opencsv.bean.CsvBindByName;
|
||||
|
||||
|
@ -17,7 +18,17 @@ public class DOAJModel implements Serializable {
|
|||
private String eissn;
|
||||
|
||||
@CsvBindByName(column = "Review process")
|
||||
private String reviewProcess;
|
||||
private List<String> reviewProcess;
|
||||
|
||||
private Integer oaStart;
|
||||
|
||||
public Integer getOaStart() {
|
||||
return oaStart;
|
||||
}
|
||||
|
||||
public void setOaStart(Integer oaStart) {
|
||||
this.oaStart = oaStart;
|
||||
}
|
||||
|
||||
public String getJournalTitle() {
|
||||
return journalTitle;
|
||||
|
@ -43,11 +54,11 @@ public class DOAJModel implements Serializable {
|
|||
this.eissn = eissn;
|
||||
}
|
||||
|
||||
public String getReviewProcess() {
|
||||
public List<String> getReviewProcess() {
|
||||
return reviewProcess;
|
||||
}
|
||||
|
||||
public void setReviewProcess(String reviewProcess) {
|
||||
public void setReviewProcess(List<String> reviewProcess) {
|
||||
this.reviewProcess = reviewProcess;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class APC implements Serializable {
|
||||
private Boolean has_apc;
|
||||
private String url;
|
||||
private List<Max> max;
|
||||
|
||||
public List<Max> getMax() {
|
||||
return max;
|
||||
}
|
||||
|
||||
public void setMax(List<Max> max) {
|
||||
this.max = max;
|
||||
}
|
||||
|
||||
public Boolean getHas_apc() {
|
||||
return has_apc;
|
||||
}
|
||||
|
||||
public void setHas_apc(Boolean has_apc) {
|
||||
this.has_apc = has_apc;
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public void setUrl(String url) {
|
||||
this.url = url;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean with the two boolean properties {@code ticked} and {@code seal}.
 */
public class Admin implements Serializable {

    private Boolean ticked;
    private Boolean seal;

    /** @return the {@code ticked} flag, or {@code null} when unset */
    public Boolean getTicked() {
        return this.ticked;
    }

    /** @param ticked new value of the {@code ticked} flag */
    public void setTicked(Boolean ticked) {
        this.ticked = ticked;
    }

    /** @return the {@code seal} flag, or {@code null} when unset */
    public Boolean getSeal() {
        return this.seal;
    }

    /** @param seal new value of the {@code seal} flag */
    public void setSeal(Boolean seal) {
        this.seal = seal;
    }
}
|
|
@ -0,0 +1,44 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Serializable bean with the properties {@code license_display_example_url},
 * {@code license_display}, {@code orcid} and {@code i4oc_open_citations}.
 */
public class Article implements Serializable {

    private String license_display_example_url;
    private List<String> license_display;
    private Boolean orcid;
    private Boolean i4oc_open_citations;

    /** @return the {@code license_display_example_url} value, or {@code null} when unset */
    public String getLicense_display_example_url() {
        return this.license_display_example_url;
    }

    /** @param license_display_example_url new {@code license_display_example_url} value */
    public void setLicense_display_example_url(String license_display_example_url) {
        this.license_display_example_url = license_display_example_url;
    }

    /** @return the {@code license_display} list, or {@code null} when unset */
    public List<String> getLicense_display() {
        return this.license_display;
    }

    /** @param license_display new {@code license_display} list (stored as is, not copied) */
    public void setLicense_display(List<String> license_display) {
        this.license_display = license_display;
    }

    /** @return the {@code orcid} flag, or {@code null} when unset */
    public Boolean getOrcid() {
        return this.orcid;
    }

    /** @param orcid new value of the {@code orcid} flag */
    public void setOrcid(Boolean orcid) {
        this.orcid = orcid;
    }

    /** @return the {@code i4oc_open_citations} flag, or {@code null} when unset */
    public Boolean getI4oc_open_citations() {
        return this.i4oc_open_citations;
    }

    /** @param i4oc_open_citations new value of the {@code i4oc_open_citations} flag */
    public void setI4oc_open_citations(Boolean i4oc_open_citations) {
        this.i4oc_open_citations = i4oc_open_citations;
    }
}
|
|
@ -0,0 +1,253 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
|
||||
public class BibJson implements Serializable {
|
||||
private Editorial editorial;
|
||||
private PidScheme pid_scheme;
|
||||
private Copyright copyright;
|
||||
private List<String> keywords;
|
||||
private Plagiarism plagiarism;
|
||||
private List<Subject> subject;
|
||||
private String eissn;
|
||||
private String pissn;
|
||||
private List<String> language;
|
||||
private String title;
|
||||
private Article article;
|
||||
private Institution institution;
|
||||
private Preservation preservation;
|
||||
private List<License> license;
|
||||
private Ref ref;
|
||||
private Integer oa_start;
|
||||
private APC apc;
|
||||
private OtherCharges other_charges;
|
||||
private Integer publication_time_weeks;
|
||||
private DepositPolicy deposit_policy;
|
||||
private Publisher publisher;
|
||||
private Boolean boai;
|
||||
private Waiver waiver;
|
||||
private String alternative_title;
|
||||
private List<String> is_replaced_by;
|
||||
private List<String> replaces;
|
||||
private String discontinued_date;
|
||||
|
||||
public String getDiscontinued_date() {
|
||||
return discontinued_date;
|
||||
}
|
||||
|
||||
public void setDiscontinued_date(String discontinued_date) {
|
||||
this.discontinued_date = discontinued_date;
|
||||
}
|
||||
|
||||
public List<String> getReplaces() {
|
||||
return replaces;
|
||||
}
|
||||
|
||||
public void setReplaces(List<String> replaces) {
|
||||
this.replaces = replaces;
|
||||
}
|
||||
|
||||
public List<String> getIs_replaced_by() {
|
||||
return is_replaced_by;
|
||||
}
|
||||
|
||||
public void setIs_replaced_by(List<String> is_replaced_by) {
|
||||
this.is_replaced_by = is_replaced_by;
|
||||
}
|
||||
|
||||
public String getAlternative_title() {
|
||||
return alternative_title;
|
||||
}
|
||||
|
||||
public void setAlternative_title(String alternative_title) {
|
||||
this.alternative_title = alternative_title;
|
||||
}
|
||||
|
||||
public String getPissn() {
|
||||
return pissn;
|
||||
}
|
||||
|
||||
public void setPissn(String pissn) {
|
||||
this.pissn = pissn;
|
||||
}
|
||||
|
||||
public Editorial getEditorial() {
|
||||
return editorial;
|
||||
}
|
||||
|
||||
public void setEditorial(Editorial editorial) {
|
||||
this.editorial = editorial;
|
||||
}
|
||||
|
||||
public PidScheme getPid_scheme() {
|
||||
return pid_scheme;
|
||||
}
|
||||
|
||||
public void setPid_scheme(PidScheme pid_scheme) {
|
||||
this.pid_scheme = pid_scheme;
|
||||
}
|
||||
|
||||
public Copyright getCopyright() {
|
||||
return copyright;
|
||||
}
|
||||
|
||||
public void setCopyright(Copyright copyright) {
|
||||
this.copyright = copyright;
|
||||
}
|
||||
|
||||
public List<String> getKeywords() {
|
||||
return keywords;
|
||||
}
|
||||
|
||||
public void setKeywords(List<String> keywords) {
|
||||
this.keywords = keywords;
|
||||
}
|
||||
|
||||
public Plagiarism getPlagiarism() {
|
||||
return plagiarism;
|
||||
}
|
||||
|
||||
public void setPlagiarism(Plagiarism plagiarism) {
|
||||
this.plagiarism = plagiarism;
|
||||
}
|
||||
|
||||
public List<Subject> getSubject() {
|
||||
return subject;
|
||||
}
|
||||
|
||||
public void setSubject(List<Subject> subject) {
|
||||
this.subject = subject;
|
||||
}
|
||||
|
||||
public String getEissn() {
|
||||
return eissn;
|
||||
}
|
||||
|
||||
public void setEissn(String eissn) {
|
||||
this.eissn = eissn;
|
||||
}
|
||||
|
||||
public List<String> getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(List<String> language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public Article getArticle() {
|
||||
return article;
|
||||
}
|
||||
|
||||
public void setArticle(Article article) {
|
||||
this.article = article;
|
||||
}
|
||||
|
||||
public Institution getInstitution() {
|
||||
return institution;
|
||||
}
|
||||
|
||||
public void setInstitution(Institution institution) {
|
||||
this.institution = institution;
|
||||
}
|
||||
|
||||
public Preservation getPreservation() {
|
||||
return preservation;
|
||||
}
|
||||
|
||||
public void setPreservation(Preservation preservation) {
|
||||
this.preservation = preservation;
|
||||
}
|
||||
|
||||
public List<License> getLicense() {
|
||||
return license;
|
||||
}
|
||||
|
||||
public void setLicense(List<License> license) {
|
||||
this.license = license;
|
||||
}
|
||||
|
||||
public Ref getRef() {
|
||||
return ref;
|
||||
}
|
||||
|
||||
public void setRef(Ref ref) {
|
||||
this.ref = ref;
|
||||
}
|
||||
|
||||
public Integer getOa_start() {
|
||||
return oa_start;
|
||||
}
|
||||
|
||||
public void setOa_start(Integer oa_start) {
|
||||
this.oa_start = oa_start;
|
||||
}
|
||||
|
||||
public APC getApc() {
|
||||
return apc;
|
||||
}
|
||||
|
||||
public void setApc(APC apc) {
|
||||
this.apc = apc;
|
||||
}
|
||||
|
||||
public OtherCharges getOther_charges() {
|
||||
return other_charges;
|
||||
}
|
||||
|
||||
public void setOther_charges(OtherCharges other_charges) {
|
||||
this.other_charges = other_charges;
|
||||
}
|
||||
|
||||
public Integer getPublication_time_weeks() {
|
||||
return publication_time_weeks;
|
||||
}
|
||||
|
||||
public void setPublication_time_weeks(Integer publication_time_weeks) {
|
||||
this.publication_time_weeks = publication_time_weeks;
|
||||
}
|
||||
|
||||
public DepositPolicy getDeposit_policy() {
|
||||
return deposit_policy;
|
||||
}
|
||||
|
||||
public void setDeposit_policy(DepositPolicy deposit_policy) {
|
||||
this.deposit_policy = deposit_policy;
|
||||
}
|
||||
|
||||
public Publisher getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(Publisher publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public Boolean getBoai() {
|
||||
return boai;
|
||||
}
|
||||
|
||||
public void setBoai(Boolean boai) {
|
||||
this.boai = boai;
|
||||
}
|
||||
|
||||
public Waiver getWaiver() {
|
||||
return waiver;
|
||||
}
|
||||
|
||||
public void setWaiver(Waiver waiver) {
|
||||
this.waiver = waiver;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean with the properties {@code author_retains} and {@code url}.
 */
public class Copyright implements Serializable {

    private Boolean author_retains;
    private String url;

    /** @return the {@code author_retains} flag, or {@code null} when unset */
    public Boolean getAuthor_retains() {
        return this.author_retains;
    }

    /** @param author_retains new value of the {@code author_retains} flag */
    public void setAuthor_retains(Boolean author_retains) {
        this.author_retains = author_retains;
    }

    /** @return the {@code url} value, or {@code null} when unset */
    public String getUrl() {
        return this.url;
    }

    /** @param url new {@code url} value */
    public void setUrl(String url) {
        this.url = url;
    }
}
|
|
@ -0,0 +1,52 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class DOAJEntry implements Serializable {
|
||||
private String last_updated;
|
||||
private BibJson bibjson;
|
||||
private Admin admin;
|
||||
private String created_date;
|
||||
private String id;
|
||||
|
||||
public String getLast_updated() {
|
||||
return last_updated;
|
||||
}
|
||||
|
||||
public void setLast_updated(String last_updated) {
|
||||
this.last_updated = last_updated;
|
||||
}
|
||||
|
||||
public BibJson getBibjson() {
|
||||
return bibjson;
|
||||
}
|
||||
|
||||
public void setBibjson(BibJson bibjson) {
|
||||
this.bibjson = bibjson;
|
||||
}
|
||||
|
||||
public Admin getAdmin() {
|
||||
return admin;
|
||||
}
|
||||
|
||||
public void setAdmin(Admin admin) {
|
||||
this.admin = admin;
|
||||
}
|
||||
|
||||
public String getCreated_date() {
|
||||
return created_date;
|
||||
}
|
||||
|
||||
public void setCreated_date(String created_date) {
|
||||
this.created_date = created_date;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Serializable bean with the properties {@code service}, {@code url} and {@code has_policy}.
 */
public class DepositPolicy implements Serializable {

    private List<String> service;
    private String url;
    private Boolean has_policy;

    /** @return the {@code service} list, or {@code null} when unset */
    public List<String> getService() {
        return this.service;
    }

    /** @param service new {@code service} list (stored as is, not copied) */
    public void setService(List<String> service) {
        this.service = service;
    }

    /** @return the {@code url} value, or {@code null} when unset */
    public String getUrl() {
        return this.url;
    }

    /** @param url new {@code url} value */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the {@code has_policy} flag, or {@code null} when unset */
    public Boolean getHas_policy() {
        return this.has_policy;
    }

    /** @param has_policy new value of the {@code has_policy} flag */
    public void setHas_policy(Boolean has_policy) {
        this.has_policy = has_policy;
    }
}
|
|
@ -0,0 +1,35 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Serializable bean with the properties {@code review_process}, {@code review_url} and
 * {@code board_url}.
 */
public class Editorial implements Serializable {

    private List<String> review_process;
    private String review_url;
    private String board_url;

    /** @return the {@code review_process} list, or {@code null} when unset */
    public List<String> getReview_process() {
        return this.review_process;
    }

    /** @param review_process new {@code review_process} list (stored as is, not copied) */
    public void setReview_process(List<String> review_process) {
        this.review_process = review_process;
    }

    /** @return the {@code review_url} value, or {@code null} when unset */
    public String getReview_url() {
        return this.review_url;
    }

    /** @param review_url new {@code review_url} value */
    public void setReview_url(String review_url) {
        this.review_url = review_url;
    }

    /** @return the {@code board_url} value, or {@code null} when unset */
    public String getBoard_url() {
        return this.board_url;
    }

    /** @param board_url new {@code board_url} value */
    public void setBoard_url(String board_url) {
        this.board_url = board_url;
    }
}
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean with the properties {@code country} and {@code name}.
 */
public class Institution implements Serializable {

    private String country;
    private String name;

    /** @return the {@code country} value, or {@code null} when unset */
    public String getCountry() {
        return this.country;
    }

    /** @param country new {@code country} value */
    public void setCountry(String country) {
        this.country = country;
    }

    /** @return the {@code name} value, or {@code null} when unset */
    public String getName() {
        return this.name;
    }

    /** @param name new {@code name} value */
    public void setName(String name) {
        this.name = name;
    }
}
|
|
@ -0,0 +1,67 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class License implements Serializable {
|
||||
private Boolean nc;
|
||||
private Boolean nd;
|
||||
private Boolean by;
|
||||
private String type;
|
||||
private Boolean sa;
|
||||
private String url;
|
||||
|
||||
public Boolean getnC() {
|
||||
return nc;
|
||||
}
|
||||
|
||||
@JsonProperty("NC")
|
||||
public void setnC(Boolean NC) {
|
||||
this.nc = NC;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public Boolean getNd() {
|
||||
return nd;
|
||||
}
|
||||
|
||||
@JsonProperty("ND")
|
||||
public void setNd(Boolean nd) {
|
||||
this.nd = nd;
|
||||
}
|
||||
|
||||
public Boolean getBy() {
|
||||
return by;
|
||||
}
|
||||
|
||||
@JsonProperty("BY")
|
||||
public void setBy(Boolean by) {
|
||||
this.by = by;
|
||||
}
|
||||
|
||||
public Boolean getSa() {
|
||||
return sa;
|
||||
}
|
||||
|
||||
@JsonProperty("SA")
|
||||
public void setSa(Boolean sa) {
|
||||
this.sa = sa;
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public void setUrl(String url) {
|
||||
this.url = url;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean pairing an integer {@code price} with a {@code currency} string.
 */
public class Max implements Serializable {

    private Integer price;
    private String currency;

    /** @return the {@code price}, or {@code null} when unset */
    public Integer getPrice() {
        return this.price;
    }

    /** @param price new {@code price} */
    public void setPrice(Integer price) {
        this.price = price;
    }

    /** @return the {@code currency}, or {@code null} when unset */
    public String getCurrency() {
        return this.currency;
    }

    /** @param currency new {@code currency} */
    public void setCurrency(String currency) {
        this.currency = currency;
    }
}
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean with the properties {@code has_other_charges} and {@code url}.
 */
public class OtherCharges implements Serializable {

    private Boolean has_other_charges;
    private String url;

    /** @return the {@code has_other_charges} flag, or {@code null} when unset */
    public Boolean getHas_other_charges() {
        return this.has_other_charges;
    }

    /** @param has_other_charges new value of the {@code has_other_charges} flag */
    public void setHas_other_charges(Boolean has_other_charges) {
        this.has_other_charges = has_other_charges;
    }

    /** @return the {@code url} value, or {@code null} when unset */
    public String getUrl() {
        return this.url;
    }

    /** @param url new {@code url} value */
    public void setUrl(String url) {
        this.url = url;
    }
}
|
|
@ -0,0 +1,26 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Serializable bean with the properties {@code scheme} and {@code has_pid_scheme}.
 */
public class PidScheme implements Serializable {

    private List<String> scheme;
    private Boolean has_pid_scheme;

    /** @return the {@code scheme} list, or {@code null} when unset */
    public List<String> getScheme() {
        return this.scheme;
    }

    /** @param scheme new {@code scheme} list (stored as is, not copied) */
    public void setScheme(List<String> scheme) {
        this.scheme = scheme;
    }

    /** @return the {@code has_pid_scheme} flag, or {@code null} when unset */
    public Boolean getHas_pid_scheme() {
        return this.has_pid_scheme;
    }

    /** @param has_pid_scheme new value of the {@code has_pid_scheme} flag */
    public void setHas_pid_scheme(Boolean has_pid_scheme) {
        this.has_pid_scheme = has_pid_scheme;
    }
}
|
|
@ -0,0 +1,27 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import javax.sql.rowset.serial.SerialArray;
|
||||
|
||||
/**
 * Serializable bean with the properties {@code detection} and {@code url}.
 */
public class Plagiarism implements Serializable {

    private Boolean detection;
    private String url;

    /** @return the {@code detection} flag, or {@code null} when unset */
    public Boolean getDetection() {
        return this.detection;
    }

    /** @param detection new value of the {@code detection} flag */
    public void setDetection(Boolean detection) {
        this.detection = detection;
    }

    /** @return the {@code url} value, or {@code null} when unset */
    public String getUrl() {
        return this.url;
    }

    /** @param url new {@code url} value */
    public void setUrl(String url) {
        this.url = url;
    }
}
|
|
@ -0,0 +1,44 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Plain serializable bean of the {@code hostedbymap.model.doaj} package holding a "has preservation" flag,
 * a list of services, a list of national libraries and a URL.
 * <p>
 * Lists are stored and returned by reference; no defensive copy is made.
 */
public class Preservation implements Serializable {

	/** Explicit serial version so the serialized form does not depend on compiler-generated values. */
	private static final long serialVersionUID = 1L;

	private Boolean has_preservation;
	private List<String> service;
	private List<String> national_library;
	private String url;

	/**
	 * @return the stored URL
	 */
	public String getUrl() {
		return url;
	}

	/**
	 * @param url the URL to store
	 */
	public void setUrl(String url) {
		this.url = url;
	}

	/**
	 * @return the stored "has preservation" flag
	 */
	public Boolean getHas_preservation() {
		return has_preservation;
	}

	/**
	 * @param has_preservation the "has preservation" flag to store
	 */
	public void setHas_preservation(Boolean has_preservation) {
		this.has_preservation = has_preservation;
	}

	/**
	 * @return the stored list of services
	 */
	public List<String> getService() {
		return service;
	}

	/**
	 * @param service the list of services to store
	 */
	public void setService(List<String> service) {
		this.service = service;
	}

	/**
	 * @return the stored list of national libraries
	 */
	public List<String> getNational_library() {
		return national_library;
	}

	/**
	 * @param national_library the list of national libraries to store
	 */
	public void setNational_library(List<String> national_library) {
		this.national_library = national_library;
	}
}
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Plain serializable bean of the {@code hostedbymap.model.doaj} package holding a publisher's country
 * and name, both as strings.
 */
public class Publisher implements Serializable {

	/** Explicit serial version so the serialized form does not depend on compiler-generated values. */
	private static final long serialVersionUID = 1L;

	private String country;
	private String name;

	/**
	 * @return the stored country
	 */
	public String getCountry() {
		return country;
	}

	/**
	 * @param country the country to store
	 */
	public void setCountry(String country) {
		this.country = country;
	}

	/**
	 * @return the stored name
	 */
	public String getName() {
		return name;
	}

	/**
	 * @param name the name to store
	 */
	public void setName(String name) {
		this.name = name;
	}
}
|
|
@ -0,0 +1,52 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Plain serializable bean of the {@code hostedbymap.model.doaj} package holding five string properties:
 * {@code aims_scope}, {@code journal}, {@code oa_statement}, {@code author_instructions} and
 * {@code license_terms}.
 * <p>
 * NOTE(review): the values are presumably reference links from the DOAJ record - confirm against the
 * input data.
 */
public class Ref implements Serializable {

	/** Explicit serial version so the serialized form does not depend on compiler-generated values. */
	private static final long serialVersionUID = 1L;

	private String aims_scope;
	private String journal;
	private String oa_statement;
	private String author_instructions;
	private String license_terms;

	/**
	 * @return the stored {@code aims_scope} value
	 */
	public String getAims_scope() {
		return aims_scope;
	}

	/**
	 * @param aims_scope the {@code aims_scope} value to store
	 */
	public void setAims_scope(String aims_scope) {
		this.aims_scope = aims_scope;
	}

	/**
	 * @return the stored {@code journal} value
	 */
	public String getJournal() {
		return journal;
	}

	/**
	 * @param journal the {@code journal} value to store
	 */
	public void setJournal(String journal) {
		this.journal = journal;
	}

	/**
	 * @return the stored {@code oa_statement} value
	 */
	public String getOa_statement() {
		return oa_statement;
	}

	/**
	 * @param oa_statement the {@code oa_statement} value to store
	 */
	public void setOa_statement(String oa_statement) {
		this.oa_statement = oa_statement;
	}

	/**
	 * @return the stored {@code author_instructions} value
	 */
	public String getAuthor_instructions() {
		return author_instructions;
	}

	/**
	 * @param author_instructions the {@code author_instructions} value to store
	 */
	public void setAuthor_instructions(String author_instructions) {
		this.author_instructions = author_instructions;
	}

	/**
	 * @return the stored {@code license_terms} value
	 */
	public String getLicense_terms() {
		return license_terms;
	}

	/**
	 * @param license_terms the {@code license_terms} value to store
	 */
	public void setLicense_terms(String license_terms) {
		this.license_terms = license_terms;
	}
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Plain serializable bean of the {@code hostedbymap.model.doaj} package describing a subject through
 * three string properties: a code, a scheme and a term.
 */
public class Subject implements Serializable {

	/** Explicit serial version so the serialized form does not depend on compiler-generated values. */
	private static final long serialVersionUID = 1L;

	private String code;
	private String scheme;
	private String term;

	/**
	 * @return the stored code
	 */
	public String getCode() {
		return code;
	}

	/**
	 * @param code the code to store
	 */
	public void setCode(String code) {
		this.code = code;
	}

	/**
	 * @return the stored scheme
	 */
	public String getScheme() {
		return scheme;
	}

	/**
	 * @param scheme the scheme to store
	 */
	public void setScheme(String scheme) {
		this.scheme = scheme;
	}

	/**
	 * @return the stored term
	 */
	public String getTerm() {
		return term;
	}

	/**
	 * @param term the term to store
	 */
	public void setTerm(String term) {
		this.term = term;
	}
}
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Plain serializable bean of the {@code hostedbymap.model.doaj} package holding a "has waiver" flag
 * together with a URL.
 */
public class Waiver implements Serializable {

	/** Explicit serial version so the serialized form does not depend on compiler-generated values. */
	private static final long serialVersionUID = 1L;

	private Boolean has_waiver;
	private String url;

	/**
	 * @return the stored URL
	 */
	public String getUrl() {
		return url;
	}

	/**
	 * @param url the URL to store
	 */
	public void setUrl(String url) {
		this.url = url;
	}

	/**
	 * @return the stored "has waiver" flag
	 */
	public Boolean getHas_waiver() {
		return has_waiver;
	}

	/**
	 * @param has_waiver the "has waiver" flag to store
	 */
	public void setHas_waiver(Boolean has_waiver) {
		this.has_waiver = has_waiver;
	}
}
|
|
@ -74,7 +74,9 @@
|
|||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="produceHBM">${wf:conf('resumeFrom') eq 'ProduceHBM'}</case>
|
||||
<case to="remove_hbmpath">${wf:conf('resumeFrom') eq 'download_csv'}</case>
|
||||
<case to="fork_downloads_csv">${wf:conf('resumeFrom') eq 'DownloadBoth'}</case>
|
||||
<case to="downloadGold">${wf:conf('resumeFrom') eq 'DownloadGold'}</case>
|
||||
<case to="downloadDOAJ">${wf:conf('resumeFrom') eq 'DownloadDoaj'}</case>
|
||||
<default to="prepareInfo"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
@ -83,18 +85,9 @@
|
|||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="remove_hbmpath">
|
||||
<fs>
|
||||
<delete path="${hostedByMapPath}"/>
|
||||
<!-- <mkdir path="${hostedByMapPath}"/>-->
|
||||
</fs>
|
||||
<ok to="fork_downloads_csv"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="fork_downloads_csv">
|
||||
<fork name="fork_downloads_csv">
|
||||
<path start="download_gold"/>
|
||||
<path start="download_doaj"/>
|
||||
<path start="download_doaj_json"/>
|
||||
</fork>
|
||||
|
||||
<action name="download_gold">
|
||||
|
@ -103,21 +96,43 @@
|
|||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--fileURL</arg><arg>${unibiFileURL}</arg>
|
||||
<arg>--tmpFile</arg><arg>/tmp/unibi_gold_replaced.csv</arg>
|
||||
<arg>--outputFile</arg><arg>${workingDir}/unibi_gold.json</arg>
|
||||
<arg>--outputFile</arg><arg>/user/${wf:user()}/data/unibi_gold.json</arg>
|
||||
<arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel</arg>
|
||||
</java>
|
||||
<ok to="join_download"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="download_doaj">
|
||||
<action name="download_doaj_json">
|
||||
<shell xmlns="uri:oozie:shell-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapred.job.queue.name</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<exec>download.sh</exec>
|
||||
<argument>${doajJsonFileURL}</argument>
|
||||
<argument>${dumpPath}</argument>
|
||||
<argument>${dumpFileName}</argument>
|
||||
<env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
|
||||
<file>download.sh</file>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="extractTarGzAndMap"/>
|
||||
<error to="Kill"/>
|
||||
|
||||
</action>
|
||||
|
||||
<action name="extractTarGzAndMap">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.DownloadCSV2</main-class>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.ExtractAndMapDoajJson</main-class>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--fileURL</arg><arg>${doajFileURL}</arg>
|
||||
<arg>--tmpFile</arg><arg>/tmp/doaj_replaced.csv</arg>
|
||||
<arg>--outputFile</arg><arg>${workingDir}/doaj.json</arg>
|
||||
<arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel</arg>
|
||||
<arg>--compressedFile</arg><arg>${dumpPath}/${dumpFileName}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/DOAJ/</arg>
|
||||
<arg>--outputPath</arg><arg>/user/${wf:user()}/data/doaj.json</arg>
|
||||
</java>
|
||||
<ok to="join_download"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -125,6 +140,54 @@
|
|||
|
||||
<join name="join_download" to="produceHBM"/>
|
||||
|
||||
<!-- Java action running DownloadCSV on ${unibiFileURL} with UnibiGoldModel as target class; the output
     file is /user/<workflow user>/data/unibi_gold.json. Unlike the forked download_gold action, success
     leads straight to produceHBM (no join); any error goes to Kill. -->
<action name="downloadGold">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.DownloadCSV</main-class>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--fileURL</arg><arg>${unibiFileURL}</arg>
|
||||
<arg>--tmpFile</arg><arg>/tmp/unibi_gold_replaced.csv</arg>
|
||||
<arg>--outputFile</arg><arg>/user/${wf:user()}/data/unibi_gold.json</arg>
|
||||
<arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel</arg>
|
||||
</java>
|
||||
<ok to="produceHBM"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- Shell action executing download.sh with three arguments (${doajJsonFileURL}, ${dumpPath},
     ${dumpFileName}) and HADOOP_USER_NAME set to the workflow user. Presumably the script fetches the
     DOAJ dump into ${dumpPath}/${dumpFileName} (the script itself is not shown here; verify).
     Success continues with the "extract" action; any error goes to Kill. -->
<action name="downloadDOAJ">
|
||||
<shell xmlns="uri:oozie:shell-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapred.job.queue.name</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<exec>download.sh</exec>
|
||||
<argument>${doajJsonFileURL}</argument>
|
||||
<argument>${dumpPath}</argument>
|
||||
<argument>${dumpFileName}</argument>
|
||||
<env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
|
||||
<file>download.sh</file>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="extract"/>
|
||||
<error to="Kill"/>
|
||||
|
||||
</action>
|
||||
|
||||
<!-- Java action running ExtractAndMapDoajJson on the archive ${dumpPath}/${dumpFileName}, using
     ${workingDir}/DOAJ/ as working path and /user/<workflow user>/data/doaj.json as output path.
     Success continues with produceHBM; any error goes to Kill. -->
<action name="extract">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.ExtractAndMapDoajJson</main-class>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--compressedFile</arg><arg>${dumpPath}/${dumpFileName}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/DOAJ/</arg>
|
||||
<arg>--outputPath</arg><arg>/user/${wf:user()}/data/doaj.json</arg>
|
||||
</java>
|
||||
<ok to="produceHBM"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="produceHBM">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
|
|
|
@ -2,9 +2,10 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.common.HdfsSupport
|
||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel}
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.commons.io.{FileUtils, IOUtils}
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
|
@ -13,7 +14,8 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
|||
import org.json4s.DefaultFormats
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import java.io.PrintWriter
|
||||
import java.io.{File, PrintWriter}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object SparkProduceHostedByMap {
|
||||
|
||||
|
@ -171,7 +173,16 @@ object SparkProduceHostedByMap {
|
|||
}
|
||||
|
||||
def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = {
|
||||
|
||||
if (doaj.getOaStart == null) {
|
||||
return getHostedByItemType(
|
||||
Constants.DOAJ,
|
||||
doaj.getJournalTitle,
|
||||
doaj.getIssn,
|
||||
doaj.getEissn,
|
||||
"",
|
||||
true
|
||||
)
|
||||
}
|
||||
return getHostedByItemType(
|
||||
Constants.DOAJ,
|
||||
doaj.getJournalTitle,
|
||||
|
@ -256,6 +267,8 @@ object SparkProduceHostedByMap {
|
|||
|
||||
logger.info("Getting the Datasources")
|
||||
|
||||
HdfsSupport.remove(outputPath, spark.sparkContext.hadoopConfiguration)
|
||||
|
||||
Aggregators
|
||||
.explodeHostedByItemType(
|
||||
oaHostedByDataset(spark, datasourcePath)
|
||||
|
|
|
@ -55,7 +55,6 @@ public class DownloadCsvTest {
|
|||
new DownloadCSV()
|
||||
.doDownload(
|
||||
fileURL,
|
||||
workingDir + "/unibi_gold",
|
||||
outputFile,
|
||||
UnibiGoldModel.class.getName(),
|
||||
',',
|
||||
|
@ -91,56 +90,6 @@ public class DownloadCsvTest {
|
|||
assertEquals(67028, count);
|
||||
}
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
void getDoajFileTest() throws CollectorException, IOException, ClassNotFoundException {
|
||||
|
||||
String fileURL = "https://doaj.org/csv";
|
||||
|
||||
final String outputFile = workingDir + "/doaj.json";
|
||||
new DownloadCSV()
|
||||
.doDownload(
|
||||
fileURL,
|
||||
workingDir + "/doaj",
|
||||
outputFile,
|
||||
DOAJModel.class.getName(),
|
||||
',',
|
||||
fs);
|
||||
|
||||
BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(new Path(outputFile))));
|
||||
|
||||
String line;
|
||||
int count = 0;
|
||||
while ((line = in.readLine()) != null) {
|
||||
DOAJModel doaj = new ObjectMapper().readValue(line, DOAJModel.class);
|
||||
if (count == 0) {
|
||||
assertEquals("0001-3765", doaj.getIssn());
|
||||
assertEquals("1678-2690", doaj.getEissn());
|
||||
assertEquals("Anais da Academia Brasileira de Ciências", doaj.getJournalTitle());
|
||||
}
|
||||
if (count == 22) {
|
||||
log.info(new ObjectMapper().writeValueAsString(doaj));
|
||||
System.out.println(new ObjectMapper().writeValueAsString(doaj));
|
||||
}
|
||||
if (count == 7904) {
|
||||
// log.info(new ObjectMapper().writeValueAsString(doaj));
|
||||
assertEquals("", doaj.getIssn());
|
||||
assertEquals("2055-7159", doaj.getEissn());
|
||||
assertEquals("BJR|case reports", doaj.getJournalTitle());
|
||||
}
|
||||
if (count == 16707) {
|
||||
|
||||
assertEquals("2783-1043", doaj.getIssn());
|
||||
assertEquals("2783-1051", doaj.getEissn());
|
||||
assertEquals("فیزیک کاربردی ایران", doaj.getJournalTitle());
|
||||
}
|
||||
|
||||
count += 1;
|
||||
}
|
||||
|
||||
assertEquals(16715, count);
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void cleanup() {
|
||||
FileUtils.deleteQuietly(new File(workingDir));
|
||||
|
|
|
@ -1,25 +1,25 @@
|
|||
{"journalTitle":"Lëd i Sneg","issn":"2076-6734","eissn":"2412-3765","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Компьютерные исследования и моделирование","issn":"2076-7633","eissn":"2077-6853","reviewProcess":"Blind peer review"}
|
||||
{"journalTitle":" Историко-биологические исследования","issn":"2076-8176","eissn":"2500-1221","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Інформаційні технології і засоби навчання","issn":"2076-8184","eissn":"","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Revue Internationale de Pédagogie de l’Enseignement Supérieur","issn":"","eissn":"2076-8427","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Проблемы развития территории","issn":"2076-8915","eissn":"2409-9007","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Rambam Maimonides Medical Journal","issn":"","eissn":"2076-9172","reviewProcess":"Peer review"}
|
||||
{"journalTitle":"Membranes","issn":"2077-0375","eissn":"","reviewProcess":"Blind peer review"}
|
||||
{"journalTitle":"Journal of Clinical Medicine","issn":"","eissn":"2077-0383","reviewProcess":"Blind peer review"}
|
||||
{"journalTitle":"Agriculture","issn":"","eissn":"2077-0472","reviewProcess":"Blind peer review"}
|
||||
{"journalTitle":"Standartnye Obrazcy","issn":"2077-1177","eissn":"","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Металл и литье Украины","issn":"2077-1304","eissn":"2706-5529","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Journal of Marine Science and Engineering","issn":"","eissn":"2077-1312","reviewProcess":"Blind peer review"}
|
||||
{"journalTitle":"Religions","issn":"","eissn":"2077-1444","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"GW-Unterricht","issn":"2077-1517","eissn":"2414-4169","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"UCV-Scientia","issn":"2077-172X","eissn":"","reviewProcess":"Peer review"}
|
||||
{"journalTitle":"Sovremennye Issledovaniâ Socialʹnyh Problem","issn":"2077-1770","eissn":"2218-7405","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Granì","issn":"2077-1800","eissn":"2413-8738","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Journal of Economics Finance and Administrative Science","issn":"2077-1886","eissn":"2218-0648","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Science Education International","issn":"","eissn":"2077-2327","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Edumecentro","issn":"","eissn":"2077-2874","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Monteverdia","issn":"","eissn":"2077-2890","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Transformación","issn":"","eissn":"2077-2955","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Journal of Space Technology","issn":"2077-3099","eissn":"2411-5029","reviewProcess":"Double blind peer review"}
|
||||
{"journalTitle":"Revue de Primatologie","issn":"","eissn":"2077-3757","reviewProcess":"Peer review"}
|
||||
{"journalTitle":"Lëd i Sneg","issn":"2076-6734","eissn":"2412-3765","reviewProcess":["Double blind peer review"],"oaStart":2015}
|
||||
{"journalTitle":"Компьютерные исследования и моделирование","issn":"2076-7633","eissn":"2077-6853","reviewProcess":["Blind peer review"],"oaStart":2009}
|
||||
{"journalTitle":" Историко-биологические исследования","issn":"2076-8176","eissn":"2500-1221","reviewProcess":["Double blind peer review"],"oaStart":2010}
|
||||
{"journalTitle":"Інформаційні технології і засоби навчання","issn":"2076-8184","eissn":null,"reviewProcess":["Double blind peer review"],"oaStart":2006}
|
||||
{"journalTitle":"Revue Internationale de Pédagogie de l’Enseignement Supérieur","issn":null,"eissn":"2076-8427","reviewProcess":["Double blind peer review"],"oaStart":2009}
|
||||
{"journalTitle":"Проблемы развития территории","issn":"2076-8915","eissn":"2409-9007","reviewProcess":["Double blind peer review"],"oaStart":2008}
|
||||
{"journalTitle":"Rambam Maimonides Medical Journal","issn":null,"eissn":"2076-9172","reviewProcess":["Peer review"],"oaStart":2010}
|
||||
{"journalTitle":"Membranes","issn":"2077-0375","eissn":null,"reviewProcess":["Blind peer review"],"oaStart":2011}
|
||||
{"journalTitle":"Journal of Clinical Medicine","issn":null,"eissn":"2077-0383","reviewProcess":["Blind peer review"],"oaStart":2012}
|
||||
{"journalTitle":"Agriculture","issn":null,"eissn":"2077-0472","reviewProcess":["Blind peer review"],"oaStart":2011}
|
||||
{"journalTitle":"Standartnye Obrazcy","issn":"2077-1177","eissn":null,"reviewProcess":["Double blind peer review"],"oaStart":2014}
|
||||
{"journalTitle":"Металл и литье Украины","issn":"2077-1304","eissn":"2706-5529","reviewProcess":["Double blind peer review"],"oaStart":2019}
|
||||
{"journalTitle":"Journal of Marine Science and Engineering","issn":null,"eissn":"2077-1312","reviewProcess":["Blind peer review"],"oaStart":2013}
|
||||
{"journalTitle":"Religions","issn":null,"eissn":"2077-1444","reviewProcess":["Double blind peer review"],"oaStart":2010}
|
||||
{"journalTitle":"GW-Unterricht","issn":"2077-1517","eissn":"2414-4169","reviewProcess":["Double blind peer review"],"oaStart":2010}
|
||||
{"journalTitle":"UCV-Scientia","issn":"2077-172X","eissn":null,"reviewProcess":["Peer review"],"oaStart":2009}
|
||||
{"journalTitle":"Sovremennye Issledovaniâ Socialʹnyh Problem","issn":"2077-1770","eissn":"2218-7405","reviewProcess":["Double blind peer review"],"oaStart":2010}
|
||||
{"journalTitle":"Granì","issn":"2077-1800","eissn":"2413-8738","reviewProcess":["Double blind peer review"],"oaStart":2014}
|
||||
{"journalTitle":"Journal of Economics Finance and Administrative Science","issn":"2077-1886","eissn":"2218-0648","reviewProcess":["Double blind peer review"],"oaStart":2017}
|
||||
{"journalTitle":"Science Education International","issn":null,"eissn":"2077-2327","reviewProcess":["Double blind peer review"],"oaStart":2017}
|
||||
{"journalTitle":"Edumecentro","issn":null,"eissn":"2077-2874","reviewProcess":["Double blind peer review"],"oaStart":2013}
|
||||
{"journalTitle":"Monteverdia","issn":null,"eissn":"2077-2890","reviewProcess":["Double blind peer review"],"oaStart":2008}
|
||||
{"journalTitle":"Transformación","issn":null,"eissn":"2077-2955","reviewProcess":["Double blind peer review"],"oaStart":2010}
|
||||
{"journalTitle":"Journal of Space Technology","issn":"2077-3099","eissn":"2411-5029","reviewProcess":["Double blind peer review"],"oaStart":2011}
|
||||
{"journalTitle":"Revue de Primatologie","issn":null,"eissn":"2077-3757","reviewProcess":["Peer review"],"oaStart":2009}
|
Loading…
Reference in New Issue