[HostedByMap] download of DOAJ from JSON, modification of test resources, deletion of the class no longer needed for the CSV download

This commit is contained in:
Miriam Baglioni 2022-03-04 15:18:21 +01:00
parent 5d608d6291
commit 2c5087d55a
7 changed files with 129 additions and 236 deletions

View File

@ -23,7 +23,7 @@ public class DownloadCSV {
private static final Logger log = LoggerFactory.getLogger(DownloadCSV.class);
public static final char DEFAULT_DELIMITER = ';';
public static final char DEFAULT_DELIMITER = ',';
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -40,9 +40,6 @@ public class DownloadCSV {
final String fileURL = parser.get("fileURL");
log.info("fileURL {}", fileURL);
final String workingPath = parser.get("workingPath");
log.info("workingPath {}", workingPath);
final String outputFile = parser.get("outputFile");
log.info("outputFile {}", outputFile);
@ -63,31 +60,15 @@ public class DownloadCSV {
FileSystem fileSystem = FileSystem.get(conf);
new DownloadCSV().doDownload(fileURL, workingPath, outputFile, classForName, delimiter, fileSystem);
new DownloadCSV().doDownload(fileURL, outputFile, classForName, delimiter, fileSystem);
}
protected void doDownload(String fileURL, String workingPath, String outputFile, String classForName,
protected void doDownload(String fileURL, String outputFile, String classForName,
char delimiter, FileSystem fs)
throws IOException, ClassNotFoundException, CollectorException {
final HttpConnector2 connector2 = new HttpConnector2();
final Path path = new Path(workingPath + "/replaced.csv");
try (BufferedReader in = new BufferedReader(
new InputStreamReader(connector2.getInputSourceAsStream(fileURL)))) {
try (PrintWriter writer = new PrintWriter(
new OutputStreamWriter(fs.create(path, true), StandardCharsets.UTF_8))) {
String line;
while ((line = in.readLine()) != null) {
writer.println(line.replace("\\\"", "\""));
}
}
}
try (InputStreamReader reader = new InputStreamReader(fs.open(path))) {
try (InputStreamReader reader = new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL))) {
GetCSV.getCsv(fs, reader, outputFile, classForName, delimiter);
}
}

View File

@ -1,84 +0,0 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.GetCSV;
import eu.dnetlib.dhp.common.collection.HttpConnector2;
/**
 * Command-line job that downloads a CSV file over HTTP, normalises its quoting into a local
 * temporary file and then hands that file to {@link GetCSV#getCsv} to be written to the target
 * file system.
 *
 * <p>Arguments are described by {@code download_csv_parameters.json}; the ones read here are
 * {@code fileURL}, {@code tmpFile}, {@code outputFile}, {@code hdfsNameNode},
 * {@code classForName} and the optional {@code delimiter}.
 */
public class DownloadCSV2 {

	private static final Logger log = LoggerFactory.getLogger(DownloadCSV2.class);

	/** Delimiter used when the {@code delimiter} argument is missing or empty. */
	public static final char DEFAULT_DELIMITER = ';';

	/**
	 * Runs the download.
	 *
	 * @param args command-line arguments, parsed according to {@code download_csv_parameters.json}
	 * @throws Exception if the arguments cannot be parsed, the download fails, the temporary file
	 *                   cannot be written completely, or the CSV conversion fails
	 */
	public static void main(final String[] args) throws Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					Objects
						.requireNonNull(
							DownloadCSV2.class
								.getResourceAsStream(
									"/eu/dnetlib/dhp/oa/graph/hostedbymap/download_csv_parameters.json"))));

		parser.parseArgument(args);

		final String fileURL = parser.get("fileURL");
		log.info("fileURL {}", fileURL);

		final String tmpFile = parser.get("tmpFile");
		log.info("tmpFile {}", tmpFile);

		final String outputFile = parser.get("outputFile");
		log.info("outputFile {}", outputFile);

		final String hdfsNameNode = parser.get("hdfsNameNode");
		log.info("hdfsNameNode {}", hdfsNameNode);

		final String classForName = parser.get("classForName");
		log.info("classForName {}", classForName);

		// An empty delimiter argument falls back to the default instead of failing on charAt(0).
		final char delimiter = Optional
			.ofNullable(parser.get("delimiter"))
			.filter(s -> !s.isEmpty())
			.map(s -> s.charAt(0))
			.orElse(DEFAULT_DELIMITER);
		log.info("delimiter {}", delimiter);

		final File tmp = new File(tmpFile);
		try {
			// Step 1: copy the remote CSV to the temporary file, turning every \" into a plain ".
			// Both sides use an explicit UTF-8 charset so the result does not depend on the JVM
			// default (assumes the remote CSV is UTF-8 encoded - TODO confirm for new sources).
			try (BufferedReader in = new BufferedReader(
				new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL), StandardCharsets.UTF_8));
				PrintWriter writer = new PrintWriter(
					new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmp), StandardCharsets.UTF_8)))) {
				String line;
				while ((line = in.readLine()) != null) {
					writer.println(line.replace("\\\"", "\""));
				}
				// PrintWriter never throws on write failures; surface them explicitly so that a
				// truncated temporary file is not silently converted.
				if (writer.checkError()) {
					throw new IOException("Error while writing temporary file " + tmpFile);
				}
			}

			// Step 2: parse the normalised CSV and write the result to the target file system.
			try (BufferedReader in = new BufferedReader(
				new InputStreamReader(new FileInputStream(tmp), StandardCharsets.UTF_8))) {
				Configuration conf = new Configuration();
				conf.set("fs.defaultFS", hdfsNameNode);

				FileSystem fileSystem = FileSystem.get(conf);

				GetCSV.getCsv(fileSystem, in, outputFile, classForName, delimiter);
			}
		} finally {
			// Remove the temporary file whether the download or the conversion failed or not.
			FileUtils.deleteQuietly(tmp);
		}
	}

}

View File

@ -9,7 +9,6 @@ import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Objects;
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
@ -24,6 +23,7 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel;
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.doaj.DOAJEntry;
public class ExtractAndMapDoajJson {

View File

@ -74,7 +74,9 @@
<decision name="resume_from">
<switch>
<case to="produceHBM">${wf:conf('resumeFrom') eq 'ProduceHBM'}</case>
<case to="remove_hbmpath">${wf:conf('resumeFrom') eq 'download_csv'}</case>
<case to="fork_downloads_csv">${wf:conf('resumeFrom') eq 'DownloadBoth'}</case>
<case to="downloadGold">${wf:conf('resumeFrom') eq 'DownloadGold'}</case>
<case to="downloadDOAJ">${wf:conf('resumeFrom') eq 'DownloadDoaj'}</case>
<default to="prepareInfo"/>
</switch>
</decision>
@ -83,18 +85,9 @@
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="remove_hbmpath">
<fs>
<delete path="${hostedByMapPath}"/>
<!-- <mkdir path="${hostedByMapPath}"/>-->
</fs>
<ok to="fork_downloads_csv"/>
<error to="Kill"/>
</action>
<fork name="fork_downloads_csv">
<fork name="fork_downloads_csv">
<path start="download_gold"/>
<path start="download_doaj"/>
<path start="download_doaj_json"/>
</fork>
<action name="download_gold">
@ -103,21 +96,43 @@
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${unibiFileURL}</arg>
<arg>--tmpFile</arg><arg>/tmp/unibi_gold_replaced.csv</arg>
<arg>--outputFile</arg><arg>${workingDir}/unibi_gold.json</arg>
<arg>--outputFile</arg><arg>/user/${wf:user()}/data/unibi_gold.json</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel</arg>
</java>
<ok to="join_download"/>
<error to="Kill"/>
</action>
<action name="download_doaj">
<action name="download_doaj_json">
<shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<exec>download.sh</exec>
<argument>${doajJsonFileURL}</argument>
<argument>${dumpPath}</argument>
<argument>${dumpFileName}</argument>
<env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
<file>download.sh</file>
<capture-output/>
</shell>
<ok to="extractTarGzAndMap"/>
<error to="Kill"/>
</action>
<action name="extractTarGzAndMap">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.DownloadCSV2</main-class>
<main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.ExtractAndMapDoajJson</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${doajFileURL}</arg>
<arg>--tmpFile</arg><arg>/tmp/doaj_replaced.csv</arg>
<arg>--outputFile</arg><arg>${workingDir}/doaj.json</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.DOAJModel</arg>
<arg>--compressedFile</arg><arg>${dumpPath}/${dumpFileName}</arg>
<arg>--workingPath</arg><arg>${workingDir}/DOAJ/</arg>
<arg>--outputPath</arg><arg>/user/${wf:user()}/data/doaj.json</arg>
</java>
<ok to="join_download"/>
<error to="Kill"/>
@ -125,6 +140,54 @@
<join name="join_download" to="produceHBM"/>
<!-- Stand-alone download of the gold list, reached from the resume_from decision when
     resumeFrom is 'DownloadGold'. Runs DownloadCSV on ${unibiFileURL}, writes
     /user/${wf:user()}/data/unibi_gold.json and then goes straight to produceHBM.
     NOTE(review): tmpFile is still passed below although the visible part of
     DownloadCSV.main does not read it; confirm it is still declared in the parameters file. -->
<action name="downloadGold">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.DownloadCSV</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${unibiFileURL}</arg>
<arg>--tmpFile</arg><arg>/tmp/unibi_gold_replaced.csv</arg>
<arg>--outputFile</arg><arg>/user/${wf:user()}/data/unibi_gold.json</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.oa.graph.hostedbymap.model.UnibiGoldModel</arg>
</java>
<ok to="produceHBM"/>
<error to="Kill"/>
</action>
<!-- Stand-alone DOAJ download, reached from the resume_from decision when resumeFrom is
     'DownloadDoaj'. Ships download.sh with the action and executes it with three arguments:
     ${doajJsonFileURL}, ${dumpPath} and ${dumpFileName} (presumably source URL, target
     directory and target file name; the script itself is not shown here). HADOOP_USER_NAME is
     set to the workflow user. On success the flow continues with the extract action. -->
<action name="downloadDOAJ">
<shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<exec>download.sh</exec>
<argument>${doajJsonFileURL}</argument>
<argument>${dumpPath}</argument>
<argument>${dumpFileName}</argument>
<env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
<file>download.sh</file>
<capture-output/>
</shell>
<ok to="extract"/>
<error to="Kill"/>
</action>
<!-- Follow-up of downloadDOAJ: runs ExtractAndMapDoajJson on ${dumpPath}/${dumpFileName},
     using ${workingDir}/DOAJ/ as working path and /user/${wf:user()}/data/doaj.json as
     output path, then continues with produceHBM. -->
<action name="extract">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.hostedbymap.ExtractAndMapDoajJson</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--compressedFile</arg><arg>${dumpPath}/${dumpFileName}</arg>
<arg>--workingPath</arg><arg>${workingDir}/DOAJ/</arg>
<arg>--outputPath</arg><arg>/user/${wf:user()}/data/doaj.json</arg>
</java>
<ok to="produceHBM"/>
<error to="Kill"/>
</action>
<action name="produceHBM">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>

View File

@ -36,9 +36,7 @@ object SparkProduceHostedByMap {
openaire.journal_id,
"",
"",
isOpenAccess,
-1,
List[String]()
isOpenAccess
)
case Constants.EISSN =>
HostedByItemType(
@ -47,9 +45,7 @@ object SparkProduceHostedByMap {
"",
openaire.journal_id,
"",
isOpenAccess,
-1,
List[String]()
isOpenAccess
)
case Constants.ISSNL =>
HostedByItemType(
@ -58,9 +54,7 @@ object SparkProduceHostedByMap {
"",
"",
openaire.journal_id,
isOpenAccess,
-1,
List[String]()
isOpenAccess
)
// catch the default with a variable so you can print it
@ -85,36 +79,34 @@ object SparkProduceHostedByMap {
issn: String,
eissn: String,
issnl: String,
oa: Boolean,
oaDate: Int,
reviewProcess: List[String]
oa: Boolean
): HostedByItemType = {
if (issn != null) {
if (eissn != null) {
if (issnl != null) {
HostedByItemType(id, officialname, issn, eissn, issnl, oa, oaDate, reviewProcess)
HostedByItemType(id, officialname, issn, eissn, issnl, oa)
} else {
HostedByItemType(id, officialname, issn, eissn, "", oa, oaDate, reviewProcess)
HostedByItemType(id, officialname, issn, eissn, "", oa)
}
} else {
if (issnl != null) {
HostedByItemType(id, officialname, issn, "", issnl, oa, oaDate, reviewProcess)
HostedByItemType(id, officialname, issn, "", issnl, oa)
} else {
HostedByItemType(id, officialname, issn, "", "", oa, oaDate, reviewProcess)
HostedByItemType(id, officialname, issn, "", "", oa)
}
}
} else {
if (eissn != null) {
if (issnl != null) {
HostedByItemType(id, officialname, "", eissn, issnl, oa, oaDate, reviewProcess)
HostedByItemType(id, officialname, "", eissn, issnl, oa)
} else {
HostedByItemType(id, officialname, "", eissn, "", oa, oaDate, reviewProcess)
HostedByItemType(id, officialname, "", eissn, "", oa)
}
} else {
if (issnl != null) {
HostedByItemType(id, officialname, "", "", issnl, oa, oaDate, reviewProcess)
HostedByItemType(id, officialname, "", "", issnl, oa)
} else {
HostedByItemType("", "", "", "", "", oa, oaDate, reviewProcess)
HostedByItemType("", "", "", "", "", oa)
}
}
}
@ -129,12 +121,10 @@ object SparkProduceHostedByMap {
dats.getJournal.getIssnPrinted,
dats.getJournal.getIssnOnline,
dats.getJournal.getIssnLinking,
false,
-1,
List[String]()
false
)
}
HostedByItemType("", "", "", "", "", false, -1, List[String]())
HostedByItemType("", "", "", "", "", false)
}
def oaHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
@ -160,9 +150,7 @@ object SparkProduceHostedByMap {
gold.getIssn,
"",
gold.getIssnL,
true,
-1,
List[String]()
true
)
}
@ -192,9 +180,7 @@ object SparkProduceHostedByMap {
doaj.getIssn,
doaj.getEissn,
"",
true,
-1,
doaj.getReviewProcess.asScala.toList
true
)
}
return getHostedByItemType(
@ -203,9 +189,7 @@ object SparkProduceHostedByMap {
doaj.getIssn,
doaj.getEissn,
"",
true,
doaj.getOaStart,
doaj.getReviewProcess.asScala.toList
true
)
}

View File

@ -55,7 +55,6 @@ public class DownloadCsvTest {
new DownloadCSV()
.doDownload(
fileURL,
workingDir + "/unibi_gold",
outputFile,
UnibiGoldModel.class.getName(),
',',
@ -91,56 +90,6 @@ public class DownloadCsvTest {
assertEquals(67028, count);
}
/**
 * Downloads the DOAJ CSV from {@code https://doaj.org/csv}, converts it to one JSON document per
 * line and spot-checks a few records as well as the total number of lines.
 *
 * <p>The expected positions and the total count are tied to the content served at the time the
 * test was written; presumably this is why the test is disabled.
 */
@Disabled
@Test
void getDoajFileTest() throws CollectorException, IOException, ClassNotFoundException {

	final String fileURL = "https://doaj.org/csv";

	final String outputFile = workingDir + "/doaj.json";
	new DownloadCSV()
		.doDownload(
			fileURL,
			workingDir + "/doaj",
			outputFile,
			DOAJModel.class.getName(),
			',',
			fs);

	// One mapper for the whole file instead of a new instance per line.
	final ObjectMapper mapper = new ObjectMapper();

	int count = 0;
	// Decode explicitly as UTF-8: the assertions below compare non-ASCII titles, so relying on
	// the platform default charset would make the outcome machine dependent. The reader is
	// closed by try-with-resources.
	try (BufferedReader in = new BufferedReader(
		new InputStreamReader(fs.open(new Path(outputFile)), java.nio.charset.StandardCharsets.UTF_8))) {
		String line;
		while ((line = in.readLine()) != null) {
			final DOAJModel doaj = mapper.readValue(line, DOAJModel.class);
			if (count == 0) {
				assertEquals("0001-3765", doaj.getIssn());
				assertEquals("1678-2690", doaj.getEissn());
				assertEquals("Anais da Academia Brasileira de Ciências", doaj.getJournalTitle());
			}
			if (count == 22) {
				// Sample record kept in the log for manual inspection.
				log.info(mapper.writeValueAsString(doaj));
			}
			if (count == 7904) {
				assertEquals("", doaj.getIssn());
				assertEquals("2055-7159", doaj.getEissn());
				assertEquals("BJR|case reports", doaj.getJournalTitle());
			}
			if (count == 16707) {
				assertEquals("2783-1043", doaj.getIssn());
				assertEquals("2783-1051", doaj.getEissn());
				assertEquals("فیزیک کاربردی ایران", doaj.getJournalTitle());
			}

			count += 1;
		}
	}

	assertEquals(16715, count);
}
@AfterAll
public static void cleanup() {
FileUtils.deleteQuietly(new File(workingDir));

View File

@ -1,25 +1,25 @@
{"journalTitle":"Lëd i Sneg","issn":"2076-6734","eissn":"2412-3765","reviewProcess":"Double blind peer review"}
{"journalTitle":"Компьютерные исследования и моделирование","issn":"2076-7633","eissn":"2077-6853","reviewProcess":"Blind peer review"}
{"journalTitle":" Историко-биологические исследования","issn":"2076-8176","eissn":"2500-1221","reviewProcess":"Double blind peer review"}
{"journalTitle":"Інформаційні технології і засоби навчання","issn":"2076-8184","eissn":"","reviewProcess":"Double blind peer review"}
{"journalTitle":"Revue Internationale de Pédagogie de lEnseignement Supérieur","issn":"","eissn":"2076-8427","reviewProcess":"Double blind peer review"}
{"journalTitle":"Проблемы развития территории","issn":"2076-8915","eissn":"2409-9007","reviewProcess":"Double blind peer review"}
{"journalTitle":"Rambam Maimonides Medical Journal","issn":"","eissn":"2076-9172","reviewProcess":"Peer review"}
{"journalTitle":"Membranes","issn":"2077-0375","eissn":"","reviewProcess":"Blind peer review"}
{"journalTitle":"Journal of Clinical Medicine","issn":"","eissn":"2077-0383","reviewProcess":"Blind peer review"}
{"journalTitle":"Agriculture","issn":"","eissn":"2077-0472","reviewProcess":"Blind peer review"}
{"journalTitle":"Standartnye Obrazcy","issn":"2077-1177","eissn":"","reviewProcess":"Double blind peer review"}
{"journalTitle":"Металл и литье Украины","issn":"2077-1304","eissn":"2706-5529","reviewProcess":"Double blind peer review"}
{"journalTitle":"Journal of Marine Science and Engineering","issn":"","eissn":"2077-1312","reviewProcess":"Blind peer review"}
{"journalTitle":"Religions","issn":"","eissn":"2077-1444","reviewProcess":"Double blind peer review"}
{"journalTitle":"GW-Unterricht","issn":"2077-1517","eissn":"2414-4169","reviewProcess":"Double blind peer review"}
{"journalTitle":"UCV-Scientia","issn":"2077-172X","eissn":"","reviewProcess":"Peer review"}
{"journalTitle":"Sovremennye Issledovaniâ Socialʹnyh Problem","issn":"2077-1770","eissn":"2218-7405","reviewProcess":"Double blind peer review"}
{"journalTitle":"Granì","issn":"2077-1800","eissn":"2413-8738","reviewProcess":"Double blind peer review"}
{"journalTitle":"Journal of Economics Finance and Administrative Science","issn":"2077-1886","eissn":"2218-0648","reviewProcess":"Double blind peer review"}
{"journalTitle":"Science Education International","issn":"","eissn":"2077-2327","reviewProcess":"Double blind peer review"}
{"journalTitle":"Edumecentro","issn":"","eissn":"2077-2874","reviewProcess":"Double blind peer review"}
{"journalTitle":"Monteverdia","issn":"","eissn":"2077-2890","reviewProcess":"Double blind peer review"}
{"journalTitle":"Transformación","issn":"","eissn":"2077-2955","reviewProcess":"Double blind peer review"}
{"journalTitle":"Journal of Space Technology","issn":"2077-3099","eissn":"2411-5029","reviewProcess":"Double blind peer review"}
{"journalTitle":"Revue de Primatologie","issn":"","eissn":"2077-3757","reviewProcess":"Peer review"}
{"journalTitle":"Lëd i Sneg","issn":"2076-6734","eissn":"2412-3765","reviewProcess":["Double blind peer review"],"oaStart":2015}
{"journalTitle":"Компьютерные исследования и моделирование","issn":"2076-7633","eissn":"2077-6853","reviewProcess":["Blind peer review"],"oaStart":2009}
{"journalTitle":" Историко-биологические исследования","issn":"2076-8176","eissn":"2500-1221","reviewProcess":["Double blind peer review"],"oaStart":2010}
{"journalTitle":"Інформаційні технології і засоби навчання","issn":"2076-8184","eissn":null,"reviewProcess":["Double blind peer review"],"oaStart":2006}
{"journalTitle":"Revue Internationale de Pédagogie de lEnseignement Supérieur","issn":null,"eissn":"2076-8427","reviewProcess":["Double blind peer review"],"oaStart":2009}
{"journalTitle":"Проблемы развития территории","issn":"2076-8915","eissn":"2409-9007","reviewProcess":["Double blind peer review"],"oaStart":2008}
{"journalTitle":"Rambam Maimonides Medical Journal","issn":null,"eissn":"2076-9172","reviewProcess":["Peer review"],"oaStart":2010}
{"journalTitle":"Membranes","issn":"2077-0375","eissn":null,"reviewProcess":["Blind peer review"],"oaStart":2011}
{"journalTitle":"Journal of Clinical Medicine","issn":null,"eissn":"2077-0383","reviewProcess":["Blind peer review"],"oaStart":2012}
{"journalTitle":"Agriculture","issn":null,"eissn":"2077-0472","reviewProcess":["Blind peer review"],"oaStart":2011}
{"journalTitle":"Standartnye Obrazcy","issn":"2077-1177","eissn":null,"reviewProcess":["Double blind peer review"],"oaStart":2014}
{"journalTitle":"Металл и литье Украины","issn":"2077-1304","eissn":"2706-5529","reviewProcess":["Double blind peer review"],"oaStart":2019}
{"journalTitle":"Journal of Marine Science and Engineering","issn":null,"eissn":"2077-1312","reviewProcess":["Blind peer review"],"oaStart":2013}
{"journalTitle":"Religions","issn":null,"eissn":"2077-1444","reviewProcess":["Double blind peer review"],"oaStart":2010}
{"journalTitle":"GW-Unterricht","issn":"2077-1517","eissn":"2414-4169","reviewProcess":["Double blind peer review"],"oaStart":2010}
{"journalTitle":"UCV-Scientia","issn":"2077-172X","eissn":null,"reviewProcess":["Peer review"],"oaStart":2009}
{"journalTitle":"Sovremennye Issledovaniâ Socialʹnyh Problem","issn":"2077-1770","eissn":"2218-7405","reviewProcess":["Double blind peer review"],"oaStart":2010}
{"journalTitle":"Granì","issn":"2077-1800","eissn":"2413-8738","reviewProcess":["Double blind peer review"],"oaStart":2014}
{"journalTitle":"Journal of Economics Finance and Administrative Science","issn":"2077-1886","eissn":"2218-0648","reviewProcess":["Double blind peer review"],"oaStart":2017}
{"journalTitle":"Science Education International","issn":null,"eissn":"2077-2327","reviewProcess":["Double blind peer review"],"oaStart":2017}
{"journalTitle":"Edumecentro","issn":null,"eissn":"2077-2874","reviewProcess":["Double blind peer review"],"oaStart":2013}
{"journalTitle":"Monteverdia","issn":null,"eissn":"2077-2890","reviewProcess":["Double blind peer review"],"oaStart":2008}
{"journalTitle":"Transformación","issn":null,"eissn":"2077-2955","reviewProcess":["Double blind peer review"],"oaStart":2010}
{"journalTitle":"Journal of Space Technology","issn":"2077-3099","eissn":"2411-5029","reviewProcess":["Double blind peer review"],"oaStart":2011}
{"journalTitle":"Revue de Primatologie","issn":null,"eissn":"2077-3757","reviewProcess":["Peer review"],"oaStart":2009}