Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
2 changed files with 62 additions and 22 deletions
Showing only changes of commit 5d608d6291 - Show all commits

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap.model;
import java.io.Serializable;
import java.util.List;
import com.opencsv.bean.CsvBindByName;
@ -17,7 +18,17 @@ public class DOAJModel implements Serializable {
private String eissn;
@CsvBindByName(column = "Review process")
private String reviewProcess;
private List<String> reviewProcess;
private Integer oaStart;
public Integer getOaStart() {
return oaStart;
}
public void setOaStart(Integer oaStart) {
this.oaStart = oaStart;
}
public String getJournalTitle() {
return journalTitle;
@ -43,11 +54,11 @@ public class DOAJModel implements Serializable {
this.eissn = eissn;
}
public String getReviewProcess() {
public List<String> getReviewProcess() {
return reviewProcess;
}
public void setReviewProcess(String reviewProcess) {
public void setReviewProcess(List<String> reviewProcess) {
this.reviewProcess = reviewProcess;
}
}

View File

@ -2,9 +2,10 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.HdfsSupport
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel}
import eu.dnetlib.dhp.schema.oaf.Datasource
import org.apache.commons.io.IOUtils
import org.apache.commons.io.{FileUtils, IOUtils}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec
@ -13,7 +14,8 @@ import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory}
import java.io.PrintWriter
import java.io.{File, PrintWriter}
import scala.collection.JavaConverters._
object SparkProduceHostedByMap {
@ -34,7 +36,9 @@ object SparkProduceHostedByMap {
openaire.journal_id,
"",
"",
isOpenAccess
isOpenAccess,
-1,
List[String]()
)
case Constants.EISSN =>
HostedByItemType(
@ -43,7 +47,9 @@ object SparkProduceHostedByMap {
"",
openaire.journal_id,
"",
isOpenAccess
isOpenAccess,
-1,
List[String]()
)
case Constants.ISSNL =>
HostedByItemType(
@ -52,7 +58,9 @@ object SparkProduceHostedByMap {
"",
"",
openaire.journal_id,
isOpenAccess
isOpenAccess,
-1,
List[String]()
)
// catch the default with a variable so you can print it
@ -77,34 +85,36 @@ object SparkProduceHostedByMap {
issn: String,
eissn: String,
issnl: String,
oa: Boolean
oa: Boolean,
oaDate: Int,
reviewProcess: List[String]
): HostedByItemType = {
if (issn != null) {
if (eissn != null) {
if (issnl != null) {
HostedByItemType(id, officialname, issn, eissn, issnl, oa)
HostedByItemType(id, officialname, issn, eissn, issnl, oa, oaDate, reviewProcess)
} else {
HostedByItemType(id, officialname, issn, eissn, "", oa)
HostedByItemType(id, officialname, issn, eissn, "", oa, oaDate, reviewProcess)
}
} else {
if (issnl != null) {
HostedByItemType(id, officialname, issn, "", issnl, oa)
HostedByItemType(id, officialname, issn, "", issnl, oa, oaDate, reviewProcess)
} else {
HostedByItemType(id, officialname, issn, "", "", oa)
HostedByItemType(id, officialname, issn, "", "", oa, oaDate, reviewProcess)
}
}
} else {
if (eissn != null) {
if (issnl != null) {
HostedByItemType(id, officialname, "", eissn, issnl, oa)
HostedByItemType(id, officialname, "", eissn, issnl, oa, oaDate, reviewProcess)
} else {
HostedByItemType(id, officialname, "", eissn, "", oa)
HostedByItemType(id, officialname, "", eissn, "", oa, oaDate, reviewProcess)
}
} else {
if (issnl != null) {
HostedByItemType(id, officialname, "", "", issnl, oa)
HostedByItemType(id, officialname, "", "", issnl, oa, oaDate, reviewProcess)
} else {
HostedByItemType("", "", "", "", "", oa)
HostedByItemType("", "", "", "", "", oa, oaDate, reviewProcess)
}
}
}
@ -119,10 +129,12 @@ object SparkProduceHostedByMap {
dats.getJournal.getIssnPrinted,
dats.getJournal.getIssnOnline,
dats.getJournal.getIssnLinking,
false
false,
-1,
List[String]()
)
}
HostedByItemType("", "", "", "", "", false)
HostedByItemType("", "", "", "", "", false, -1, List[String]())
}
def oaHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
@ -148,7 +160,9 @@ object SparkProduceHostedByMap {
gold.getIssn,
"",
gold.getIssnL,
true
true,
-1,
List[String]()
)
}
@ -171,14 +185,27 @@ object SparkProduceHostedByMap {
}
def doajToHostedbyItemType(doaj: DOAJModel): HostedByItemType = {
if (doaj.getOaStart == null) {
return getHostedByItemType(
Constants.DOAJ,
doaj.getJournalTitle,
doaj.getIssn,
doaj.getEissn,
"",
true
true,
-1,
doaj.getReviewProcess.asScala.toList
)
}
return getHostedByItemType(
Constants.DOAJ,
doaj.getJournalTitle,
doaj.getIssn,
doaj.getEissn,
"",
true,
doaj.getOaStart,
doaj.getReviewProcess.asScala.toList
)
}
@ -256,6 +283,8 @@ object SparkProduceHostedByMap {
logger.info("Getting the Datasources")
HdfsSupport.remove(outputPath, spark.sparkContext.hadoopConfiguration)
Aggregators
.explodeHostedByItemType(
oaHostedByDataset(spark, datasourcePath)