Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
2 changed files with 62 additions and 22 deletions
Showing only changes of commit 5d608d6291 - Show all commits

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap.model; package eu.dnetlib.dhp.oa.graph.hostedbymap.model;
import java.io.Serializable; import java.io.Serializable;
import java.util.List;
import com.opencsv.bean.CsvBindByName; import com.opencsv.bean.CsvBindByName;
@ -17,7 +18,17 @@ public class DOAJModel implements Serializable {
private String eissn; private String eissn;
@CsvBindByName(column = "Review process") @CsvBindByName(column = "Review process")
private String reviewProcess; private List<String> reviewProcess;
private Integer oaStart;
/** Returns the oaStart field; it is a boxed Integer with no initializer, so it is null until set. */
public Integer getOaStart() {
return oaStart;
}
/** Stores the given value (which may be null) in the oaStart field. */
public void setOaStart(Integer oaStart) {
this.oaStart = oaStart;
}
public String getJournalTitle() { public String getJournalTitle() {
return journalTitle; return journalTitle;
@ -43,11 +54,11 @@ public class DOAJModel implements Serializable {
this.eissn = eissn; this.eissn = eissn;
} }
// Side-by-side diff: the left column is the previous getter returning String, the right column
// is the new getter returning List<String>. Both simply return the reviewProcess field.
public String getReviewProcess() { public List<String> getReviewProcess() {
return reviewProcess; return reviewProcess;
} }
// Side-by-side diff: the left column is the previous setter taking a String, the right column
// is the new setter taking a List<String>. Both assign the argument to the reviewProcess field.
public void setReviewProcess(String reviewProcess) { public void setReviewProcess(List<String> reviewProcess) {
this.reviewProcess = reviewProcess; this.reviewProcess = reviewProcess;
} }
} }

View File

@ -2,9 +2,10 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.HdfsSupport
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel} import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel}
import eu.dnetlib.dhp.schema.oaf.Datasource import eu.dnetlib.dhp.schema.oaf.Datasource
import org.apache.commons.io.IOUtils import org.apache.commons.io.{FileUtils, IOUtils}
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.io.compress.GzipCodec
@ -13,7 +14,8 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.io.PrintWriter import java.io.{File, PrintWriter}
import scala.collection.JavaConverters._
object SparkProduceHostedByMap { object SparkProduceHostedByMap {
@ -34,7 +36,9 @@ object SparkProduceHostedByMap {
openaire.journal_id, openaire.journal_id,
"", "",
"", "",
isOpenAccess isOpenAccess,
-1,
List[String]()
) )
case Constants.EISSN => case Constants.EISSN =>
HostedByItemType( HostedByItemType(
@ -43,7 +47,9 @@ object SparkProduceHostedByMap {
"", "",
openaire.journal_id, openaire.journal_id,
"", "",
isOpenAccess isOpenAccess,
-1,
List[String]()
) )
case Constants.ISSNL => case Constants.ISSNL =>
HostedByItemType( HostedByItemType(
@ -52,7 +58,9 @@ object SparkProduceHostedByMap {
"", "",
"", "",
openaire.journal_id, openaire.journal_id,
isOpenAccess isOpenAccess,
-1,
List[String]()
) )
// catch the default with a variable so you can print it // catch the default with a variable so you can print it
@ -77,34 +85,36 @@ object SparkProduceHostedByMap {
issn: String, issn: String,
eissn: String, eissn: String,
issnl: String, issnl: String,
oa: Boolean oa: Boolean,
oaDate: Int,
reviewProcess: List[String]
): HostedByItemType = { ): HostedByItemType = {
if (issn != null) { if (issn != null) {
if (eissn != null) { if (eissn != null) {
if (issnl != null) { if (issnl != null) {
HostedByItemType(id, officialname, issn, eissn, issnl, oa) HostedByItemType(id, officialname, issn, eissn, issnl, oa, oaDate, reviewProcess)
} else { } else {
HostedByItemType(id, officialname, issn, eissn, "", oa) HostedByItemType(id, officialname, issn, eissn, "", oa, oaDate, reviewProcess)
} }
} else { } else {
if (issnl != null) { if (issnl != null) {
HostedByItemType(id, officialname, issn, "", issnl, oa) HostedByItemType(id, officialname, issn, "", issnl, oa, oaDate, reviewProcess)
} else { } else {
HostedByItemType(id, officialname, issn, "", "", oa) HostedByItemType(id, officialname, issn, "", "", oa, oaDate, reviewProcess)
} }
} }
} else { } else {
if (eissn != null) { if (eissn != null) {
if (issnl != null) { if (issnl != null) {
HostedByItemType(id, officialname, "", eissn, issnl, oa) HostedByItemType(id, officialname, "", eissn, issnl, oa, oaDate, reviewProcess)
} else { } else {
HostedByItemType(id, officialname, "", eissn, "", oa) HostedByItemType(id, officialname, "", eissn, "", oa, oaDate, reviewProcess)
} }
} else { } else {
if (issnl != null) { if (issnl != null) {
HostedByItemType(id, officialname, "", "", issnl, oa) HostedByItemType(id, officialname, "", "", issnl, oa, oaDate, reviewProcess)
} else { } else {
HostedByItemType("", "", "", "", "", oa) HostedByItemType("", "", "", "", "", oa, oaDate, reviewProcess)
} }
} }
} }
@ -119,10 +129,12 @@ object SparkProduceHostedByMap {
dats.getJournal.getIssnPrinted, dats.getJournal.getIssnPrinted,
dats.getJournal.getIssnOnline, dats.getJournal.getIssnOnline,
dats.getJournal.getIssnLinking, dats.getJournal.getIssnLinking,
false false,
-1,
List[String]()
) )
} }
HostedByItemType("", "", "", "", "", false) HostedByItemType("", "", "", "", "", false, -1, List[String]())
} }
def oaHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = { def oaHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
@ -148,7 +160,9 @@ object SparkProduceHostedByMap {
gold.getIssn, gold.getIssn,
"", "",
gold.getIssnL, gold.getIssnL,
true true,
-1,
List[String]()
) )
} }
@ -171,14 +185,27 @@ object SparkProduceHostedByMap {
} }
// Converts a DOAJ record into a HostedByItemType by delegating to getHostedByItemType with
// Constants.DOAJ as id, the journal title as official name, the record's issn and eissn,
// an empty issnl and oa = true.
// Side-by-side diff rendering: lines that show text twice carry the old and the new column;
// lines that appear once exist only in the new version.
def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = { def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = {
// New in this commit: a record without an oaStart value passes -1 as the oaDate argument.
// NOTE(review): getReviewProcess is dereferenced with .asScala in both branches without a
// null check - confirm the list is always populated for DOAJ records.
if (doaj.getOaStart == null) {
return getHostedByItemType( return getHostedByItemType(
Constants.DOAJ, Constants.DOAJ,
doaj.getJournalTitle, doaj.getJournalTitle,
doaj.getIssn, doaj.getIssn,
doaj.getEissn, doaj.getEissn,
"", "",
true true,
-1,
doaj.getReviewProcess.asScala.toList
)
}
// Otherwise the record's own oaStart is forwarded as the oaDate argument.
return getHostedByItemType(
Constants.DOAJ,
doaj.getJournalTitle,
doaj.getIssn,
doaj.getEissn,
"",
true,
doaj.getOaStart,
doaj.getReviewProcess.asScala.toList
) )
} }
@ -256,6 +283,8 @@ object SparkProduceHostedByMap {
logger.info("Getting the Datasources") logger.info("Getting the Datasources")
HdfsSupport.remove(outputPath, spark.sparkContext.hadoopConfiguration)
Aggregators Aggregators
.explodeHostedByItemType( .explodeHostedByItemType(
oaHostedByDataset(spark, datasourcePath) oaHostedByDataset(spark, datasourcePath)