forked from D-Net/dnet-hadoop

removed all comments in Italian

This commit is contained in:
parent 52c18c2697
commit b1c6140ebf
@@ -61,8 +61,6 @@ object SparkApplyHostedByMapToDatasource {
     val pinfo : Dataset[EntityInfo] = Aggregators.datasourceToSingleId( spark.read.textFile(preparedInfoPath)
       .map(ei => mapper.readValue(ei, classOf[EntityInfo])))
 
-    //c. dataset join risultato del passo prima di a per datasource id, gruppo per ds id e cambio compatibilita' se necessario
-
     applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath)
 
   }
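Both hunks above rely on the same read-and-deserialize idiom: a text file of one JSON object per line is read as a Dataset[String] and each line is bound to a typed bean with Jackson. Below is a minimal runnable sketch of that idiom, under stated assumptions: a simplified EntityInfo case class (the real one in dnet-hadoop is a richer model class), jackson-module-scala on the classpath so Jackson can bind case classes, and a placeholder input path.

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.spark.sql.{Dataset, SparkSession}

// Simplified stand-in for the EntityInfo model class in dnet-hadoop
case class EntityInfo(id: String, journalId: String, hostedById: String)

object ReadPreparedInfoSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("read-prepared-info")
      .master("local[*]") // local master only for the sketch
      .getOrCreate()
    import spark.implicits._

    val preparedInfoPath = "/tmp/preparedInfo" // placeholder path

    // one JSON object per line -> one EntityInfo per row; the mapper is built
    // per partition so it never has to be serialized into the closure
    val pinfo: Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath)
      .mapPartitions { lines =>
        // DefaultScalaModule lets Jackson bind Scala case classes
        val mapper = new ObjectMapper().registerModule(DefaultScalaModule)
        lines.map(line => mapper.readValue(line, classOf[EntityInfo]))
      }

    pinfo.show(false)
    spark.stop()
  }
}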
@@ -14,9 +14,6 @@ import org.slf4j.{Logger, LoggerFactory}
 
 import scala.collection.JavaConverters._
 
-//a. publication join risultato del passo precedente su result id (left) setto la istanza (se piu' di una instance
-// nel result => salto)con l'hosted by anche access right della instance se openaccess e' true
-
 
 object SparkApplyHostedByMapToResult {
 
@@ -76,8 +73,6 @@ object SparkApplyHostedByMapToResult {
     val pinfo : Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath)
       .map(ei => mapper.readValue(ei, classOf[EntityInfo]))
 
-    //a. publication join risultato del passo precedente su result id (left) setto la istanza (se piu' di una instance
-    // nel result => salto)con l'hosted by anche access right della instance se openaccess e' true
     applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath)
 
 
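The Italian comment removed here described the logic of applyHBtoPubs: publications are left-joined with the previous step's output on the result id; if a publication has more than one instance it is skipped, otherwise the instance's hosted-by is set and, when the map flags open access, its access right too. The sketch below illustrates that logic with simplified stand-in model classes; every name here is illustrative, not the real dnet-hadoop API.

import org.apache.spark.sql.{Dataset, SparkSession}

// Illustrative stand-ins for the OpenAIRE model classes
case class Instance(hostedById: String, accessRight: String)
case class Publication(id: String, instances: Seq[Instance])
case class EntityInfo(id: String, hostedById: String, openAccess: Boolean)

object ApplyHostedBySketch {
  // left join on the result id; touch only publications with exactly one instance
  def applyHBtoPubs(pinfo: Dataset[EntityInfo], pubs: Dataset[Publication])
                   (spark: SparkSession): Dataset[Publication] = {
    import spark.implicits._
    pubs.joinWith(pinfo, pubs("id") === pinfo("id"), "left")
      .map { case (pub, info) =>
        if (info == null || pub.instances.size != 1) pub // no match or >1 instance: skip
        else {
          val i = pub.instances.head
          pub.copy(instances = Seq(i.copy(
            hostedById = info.hostedById,
            // change the access right only when the map marks it open access
            accessRight = if (info.openAccess) "OPEN" else i.accessRight)))
        }
      }
  }
}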
@@ -106,14 +106,15 @@ object SparkPrepareHostedByInfoToApply {
     import spark.implicits._
 
 
-    //STEP1: leggere la hostedbymap e trasformarla in entity info
+    //STEP1: read the hostedbymap and transform it into EntityInfo
     val hostedByInfo:Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
 
-    //STEP2: creare la mappa publication id issn, eissn, lissn esplosa
+    //STEP2: create the associations (publication, issn), (publication, eissn), (publication, lissn)
     val resultInfoDataset:Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication")
 
-    //STEP3: join resultInfo con hostedByInfo sul journal_id dal result con left
-    // e riduzione di tutti i result con lo stesso id in una unica entry con aggiunto l'id della datasource
+    //STEP3: left join resultInfo with hostedByInfo on journal_id. Reduction of all the results with the same id in just
+    //one entry (one result could be associated to issn and eissn and so possibly matching more than once against the map);
+    //to this entry we add the id of the datasource for the next step
     joinResHBM(resultInfoDataset, hostedByInfo)
       .write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
 
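The new STEP3 comment explains the key subtlety: because STEP2 explodes each publication into one row per journal identifier, a single result may match the hosted-by map more than once (e.g. via both issn and eissn), so the matches must be reduced back to one entry per result id, carrying the datasource id forward. A hedged sketch of that join-then-reduce shape follows, reusing the simplified EntityInfo above; the field names and the tie-breaking rule are assumptions, not the actual joinResHBM implementation.

import org.apache.spark.sql.{Dataset, SparkSession}

// Illustrative EntityInfo: one row per (result, journal id) pair
case class EntityInfo(id: String, journalId: String, hostedById: String, openAccess: Boolean)

object JoinResHBMSketch {
  def joinResHBM(res: Dataset[EntityInfo], hbm: Dataset[EntityInfo])
                (spark: SparkSession): Dataset[EntityInfo] = {
    import spark.implicits._
    res.joinWith(hbm, res("journalId") === hbm("journalId"), "left")
      .map { case (r, h) =>
        if (h == null) r
        else r.copy(hostedById = h.hostedById, openAccess = h.openAccess) // attach the datasource id
      }
      // a result may match via issn and eissn: keep a single entry per result id,
      // preferring one that actually found a datasource
      .groupByKey(_.id)
      .reduceGroups((a, b) => if (a.hostedById.nonEmpty) a else b)
      .map(_._2)
  }
}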