forked from D-Net/dnet-hadoop
Code refactor: created and moved the Scala code into the correct Maven folders, src/main/scala and src/test/scala
This commit is contained in:
parent
2d67020c59
commit
cded363b55
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.actionmanager.scholix
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation}
|
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, Oaf, Publication, Software, OtherResearchProduct, Relation}
|
||||||
import org.apache.hadoop.io.Text
|
import org.apache.hadoop.io.Text
|
||||||
import org.apache.hadoop.io.compress.GzipCodec
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
|
@ -3,8 +3,7 @@ package eu.dnetlib.dhp.datacite
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
|
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
|
||||||
import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH
|
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
||||||
import eu.dnetlib.dhp.common.Constants.MDSTORE_SIZE_PATH
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
|
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
|
@ -7,6 +7,7 @@ import org.json4s.DefaultFormats
|
||||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||||
import collection.JavaConverters._
|
import collection.JavaConverters._
|
||||||
|
|
||||||
object BioDBToOAF {
|
object BioDBToOAF {
|
||||||
|
|
||||||
case class EBILinkItem(id: Long, links: String) {}
|
case class EBILinkItem(id: Long, links: String) {}
|
|
@ -1,9 +1,9 @@
|
||||||
package eu.dnetlib.dhp.sx.bio
|
package eu.dnetlib.dhp.sx.bio
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
|
||||||
import BioDBToOAF.ScholixResolved
|
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
|
@ -36,13 +36,13 @@ object SparkTransformBioDatabaseToOAF {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
database.toUpperCase() match {
|
database.toUpperCase() match {
|
||||||
case "UNIPROT" =>
|
case "UNIPROT" =>
|
||||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
case "PDB" =>
|
case "PDB" =>
|
||||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
case "SCHOLIX" =>
|
case "SCHOLIX" =>
|
||||||
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
case "CROSSREF_LINKS" =>
|
case "CROSSREF_LINKS" =>
|
||||||
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result
|
import eu.dnetlib.dhp.schema.oaf.Result
|
||||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
|
import eu.dnetlib.dhp.sx.bio.pubmed._
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.conf.Configuration
|
import org.apache.hadoop.conf.Configuration
|
|
@ -1,9 +1,8 @@
|
||||||
package eu.dnetlib.dhp.sx.bio.ebi
|
package eu.dnetlib.dhp.sx.bio.ebi
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||||
import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal
|
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.http.client.config.RequestConfig
|
import org.apache.http.client.config.RequestConfig
|
||||||
import org.apache.http.client.methods.HttpGet
|
import org.apache.http.client.methods.HttpGet
|
|
@ -1,11 +1,10 @@
|
||||||
package eu.dnetlib.dhp.sx.bio.ebi
|
package eu.dnetlib.dhp.sx.bio.ebi
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||||
import BioDBToOAF.EBILinkItem
|
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
|
@ -38,7 +37,7 @@ object SparkEBILinksToOaf {
|
||||||
ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
||||||
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
||||||
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p))
|
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p))
|
||||||
.flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null)
|
.flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null)
|
||||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -4,10 +4,9 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
||||||
import eu.dnetlib.dhp.schema.oaf._
|
import eu.dnetlib.dhp.schema.oaf._
|
||||||
import scala.collection.JavaConverters._
|
|
||||||
|
|
||||||
import java.util.regex.Pattern
|
import java.util.regex.Pattern
|
||||||
|
import collection.JavaConverters._
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
@ -22,10 +21,10 @@ object PubMedToOaf {
|
||||||
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleaning the DOI Applying regex in order to
|
* Cleaning the DOI Applying regex in order to
|
||||||
* remove doi starting with URL
|
* remove doi starting with URL
|
||||||
|
*
|
||||||
* @param doi input DOI
|
* @param doi input DOI
|
||||||
* @return cleaned DOI
|
* @return cleaned DOI
|
||||||
*/
|
*/
|
||||||
|
@ -93,7 +92,6 @@ object PubMedToOaf {
|
||||||
* @param vocabularyName the input vocabulary name
|
* @param vocabularyName the input vocabulary name
|
||||||
* @param vocabularies all the vocabularies
|
* @param vocabularies all the vocabularies
|
||||||
* @param term the term to find
|
* @param term the term to find
|
||||||
*
|
|
||||||
* @return the cleaned term value
|
* @return the cleaned term value
|
||||||
*/
|
*/
|
||||||
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
|
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
|
||||||
|
@ -106,7 +104,6 @@ object PubMedToOaf {
|
||||||
/**
|
/**
|
||||||
* Map the Pubmed Article into the OAF instance
|
* Map the Pubmed Article into the OAF instance
|
||||||
*
|
*
|
||||||
*
|
|
||||||
* @param article the pubmed articles
|
* @param article the pubmed articles
|
||||||
* @param vocabularies the vocabularies
|
* @param vocabularies the vocabularies
|
||||||
* @return The OAF instance if the mapping did not fail
|
* @return The OAF instance if the mapping did not fail
|
||||||
|
@ -185,7 +182,6 @@ object PubMedToOaf {
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// RESULT MAPPING
|
// RESULT MAPPING
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
|
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
|
Loading…
Reference in New Issue