forked from D-Net/dnet-hadoop
Revert to 4094f2bb9a
This commit is contained in:
parent
9c82d670b8
commit
bd9a43cefd
|
@ -27,8 +27,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
public static final int ORCID_LEN = 19;
|
public static final int ORCID_LEN = 19;
|
||||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||||
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
||||||
public static final String TITLE_FILTER_REGEX = "(test)|\\W|\\d";
|
public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
|
||||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
|
||||||
|
|
||||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||||
if (value instanceof Datasource) {
|
if (value instanceof Datasource) {
|
||||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.actionmanager.scholix
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, Oaf, Publication, Software, OtherResearchProduct, Relation}
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation}
|
||||||
import org.apache.hadoop.io.Text
|
import org.apache.hadoop.io.Text
|
||||||
import org.apache.hadoop.io.compress.GzipCodec
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
|
@ -3,7 +3,8 @@ package eu.dnetlib.dhp.datacite
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
|
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
|
||||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH
|
||||||
|
import eu.dnetlib.dhp.common.Constants.MDSTORE_SIZE_PATH
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
|
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
|
@ -7,7 +7,6 @@ import org.json4s.DefaultFormats
|
||||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||||
import collection.JavaConverters._
|
import collection.JavaConverters._
|
||||||
|
|
||||||
object BioDBToOAF {
|
object BioDBToOAF {
|
||||||
|
|
||||||
case class EBILinkItem(id: Long, links: String) {}
|
case class EBILinkItem(id: Long, links: String) {}
|
|
@ -1,9 +1,9 @@
|
||||||
package eu.dnetlib.dhp.sx.bio
|
package eu.dnetlib.dhp.sx.bio
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
import BioDBToOAF.ScholixResolved
|
||||||
|
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result
|
import eu.dnetlib.dhp.schema.oaf.Result
|
||||||
import eu.dnetlib.dhp.sx.bio.pubmed._
|
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.conf.Configuration
|
import org.apache.hadoop.conf.Configuration
|
|
@ -1,8 +1,9 @@
|
||||||
package eu.dnetlib.dhp.sx.bio.ebi
|
package eu.dnetlib.dhp.sx.bio.ebi
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
|
||||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||||
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||||
|
import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.http.client.config.RequestConfig
|
import org.apache.http.client.config.RequestConfig
|
||||||
import org.apache.http.client.methods.HttpGet
|
import org.apache.http.client.methods.HttpGet
|
|
@ -1,10 +1,11 @@
|
||||||
package eu.dnetlib.dhp.sx.bio.ebi
|
package eu.dnetlib.dhp.sx.bio.ebi
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF
|
||||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||||
|
import BioDBToOAF.EBILinkItem
|
||||||
|
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
|
@ -4,9 +4,10 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
||||||
import eu.dnetlib.dhp.schema.oaf._
|
import eu.dnetlib.dhp.schema.oaf._
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
import java.util.regex.Pattern
|
import java.util.regex.Pattern
|
||||||
import collection.JavaConverters._
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
@ -21,10 +22,10 @@ object PubMedToOaf {
|
||||||
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleaning the DOI Applying regex in order to
|
* Cleaning the DOI Applying regex in order to
|
||||||
* remove doi starting with URL
|
* remove doi starting with URL
|
||||||
*
|
|
||||||
* @param doi input DOI
|
* @param doi input DOI
|
||||||
* @return cleaned DOI
|
* @return cleaned DOI
|
||||||
*/
|
*/
|
||||||
|
@ -92,6 +93,7 @@ object PubMedToOaf {
|
||||||
* @param vocabularyName the input vocabulary name
|
* @param vocabularyName the input vocabulary name
|
||||||
* @param vocabularies all the vocabularies
|
* @param vocabularies all the vocabularies
|
||||||
* @param term the term to find
|
* @param term the term to find
|
||||||
|
*
|
||||||
* @return the cleaned term value
|
* @return the cleaned term value
|
||||||
*/
|
*/
|
||||||
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
|
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
|
||||||
|
@ -104,6 +106,7 @@ object PubMedToOaf {
|
||||||
/**
|
/**
|
||||||
* Map the Pubmed Article into the OAF instance
|
* Map the Pubmed Article into the OAF instance
|
||||||
*
|
*
|
||||||
|
*
|
||||||
* @param article the pubmed articles
|
* @param article the pubmed articles
|
||||||
* @param vocabularies the vocabularies
|
* @param vocabularies the vocabularies
|
||||||
* @return The OAF instance if the mapping did not fail
|
* @return The OAF instance if the mapping did not fail
|
||||||
|
@ -182,6 +185,7 @@ object PubMedToOaf {
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// RESULT MAPPING
|
// RESULT MAPPING
|
||||||
//--------------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------------
|
||||||
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
|
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
|
|
@ -19,7 +19,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
public class UpdateMatcherTest {
|
class UpdateMatcherTest {
|
||||||
|
|
||||||
UpdateMatcher<String> matcher = new EnrichMissingPublicationDate();
|
UpdateMatcher<String> matcher = new EnrichMissingPublicationDate();
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||||
|
|
||||||
public class EnrichMissingPublicationDateTest {
|
class EnrichMissingPublicationDateTest {
|
||||||
|
|
||||||
final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate();
|
final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate();
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ import java.util.Arrays;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
public class SubscriptionUtilsTest {
|
class SubscriptionUtilsTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testVerifyListSimilar() {
|
void testVerifyListSimilar() {
|
||||||
|
|
|
@ -9,7 +9,7 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||||
|
|
||||||
public class TrustUtilsTest {
|
class TrustUtilsTest {
|
||||||
|
|
||||||
private static final double THRESHOLD = 0.95;
|
private static final double THRESHOLD = 0.95;
|
||||||
|
|
||||||
|
|
|
@ -4,19 +4,20 @@ import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf._
|
import eu.dnetlib.dhp.schema.oaf._
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{decideAccessRight, _}
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
|
||||||
import org.apache.commons.lang.StringUtils
|
import org.apache.commons.lang.StringUtils
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.JsonAST._
|
import org.json4s.JsonAST.{JValue, _}
|
||||||
import org.json4s.jackson.JsonMethods._
|
import org.json4s.jackson.JsonMethods._
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import java.util
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.collection.mutable
|
import scala.collection.mutable
|
||||||
import scala.util.matching.Regex
|
import scala.util.matching.Regex
|
||||||
|
import java.util
|
||||||
|
|
||||||
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||||
|
|
||||||
case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
|
case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
|
||||||
|
|
|
@ -6,7 +6,7 @@ import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.io.{IntWritable, Text}
|
import org.apache.hadoop.io.{IntWritable, Text}
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.expressions.Aggregator
|
import org.apache.spark.sql.expressions.Aggregator
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
@ -30,6 +30,7 @@ object CrossrefDataset {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
|
val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
|
@ -2,12 +2,17 @@ package eu.dnetlib.doiboost.crossref
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||||
|
import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item
|
||||||
|
import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass
|
||||||
|
import org.apache.hadoop.io.{IntWritable, Text}
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
|
||||||
import org.apache.spark.{SparkConf, SparkContext}
|
import org.apache.spark.{SparkConf, SparkContext}
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.JsonAST.JArray
|
||||||
|
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
@ -19,6 +24,7 @@ object GenerateCrossrefDataset {
|
||||||
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
|
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def crossrefElement(meta: String): CrossrefDT = {
|
def crossrefElement(meta: String): CrossrefDT = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: json4s.JValue = parse(meta)
|
lazy val json: json4s.JValue = parse(meta)
|
|
@ -4,8 +4,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf
|
import eu.dnetlib.dhp.schema.oaf
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
|
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
|
||||||
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,8 @@ package eu.dnetlib.doiboost.crossref
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import org.apache.hadoop.io.compress.GzipCodec
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
import org.apache.spark.sql.SparkSession
|
|
||||||
import org.apache.spark.{SparkConf, SparkContext}
|
import org.apache.spark.{SparkConf, SparkContext}
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.JsonAST.JArray
|
import org.json4s.JsonAST.JArray
|
||||||
|
@ -17,6 +17,8 @@ object UnpackCrtossrefEntries {
|
||||||
val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass)
|
val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def extractDump(input:String):List[String] = {
|
def extractDump(input:String):List[String] = {
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: json4s.JValue = parse(input)
|
lazy val json: json4s.JValue = parse(input)
|
||||||
|
@ -28,6 +30,7 @@ object UnpackCrtossrefEntries {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf = new SparkConf
|
val conf = new SparkConf
|
||||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
|
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
|
|
@ -5,10 +5,10 @@ import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
|
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.collection.mutable
|
import scala.collection.mutable
|
|
@ -3,8 +3,8 @@ package eu.dnetlib.doiboost.mag
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.types._
|
|
||||||
import org.apache.spark.sql.{SaveMode, SparkSession}
|
import org.apache.spark.sql.{SaveMode, SparkSession}
|
||||||
|
import org.apache.spark.sql.types._
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkImportMagIntoDataset {
|
object SparkImportMagIntoDataset {
|
||||||
|
@ -75,6 +75,7 @@ object SparkImportMagIntoDataset {
|
||||||
.master(parser.get("master")).getOrCreate()
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
stream.foreach { case (k, v) =>
|
stream.foreach { case (k, v) =>
|
||||||
val s: StructType = getSchema(k)
|
val s: StructType = getSchema(k)
|
||||||
val df = spark.read
|
val df = spark.read
|
|
@ -5,10 +5,13 @@ import eu.dnetlib.dhp.schema.oaf.Publication
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.functions.{col, collect_list, struct}
|
import org.apache.spark.rdd.RDD
|
||||||
|
import org.apache.spark.sql.functions._
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
object SparkProcessMAG {
|
object SparkProcessMAG {
|
||||||
|
|
||||||
def getDistinctResults (d:Dataset[MagPapers]):Dataset[MagPapers]={
|
def getDistinctResults (d:Dataset[MagPapers]):Dataset[MagPapers]={
|
||||||
|
@ -150,5 +153,6 @@ object SparkProcessMAG {
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -4,16 +4,17 @@ import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
|
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.{AuthorData, OrcidDOI}
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo}
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo}
|
||||||
import org.apache.commons.lang.StringUtils
|
import org.apache.commons.lang.StringUtils
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.JsonAST._
|
import org.json4s.JsonAST._
|
||||||
import org.json4s.jackson.JsonMethods._
|
import org.json4s.jackson.JsonMethods._
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
|
||||||
|
|
||||||
|
|
||||||
case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){}
|
case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){}
|
|
@ -1,12 +1,15 @@
|
||||||
package eu.dnetlib.doiboost.orcid
|
package eu.dnetlib.doiboost.orcid
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.oa.merge.AuthorMerger
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.sql.functions.{col, collect_list}
|
import org.apache.spark.sql.functions._
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkPreprocessORCID {
|
object SparkPreprocessORCID {
|
|
@ -1,14 +1,16 @@
|
||||||
package eu.dnetlib.doiboost.uw
|
package eu.dnetlib.doiboost.uw
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||||
import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF
|
import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
|
||||||
object SparkMapUnpayWallToOAF {
|
object SparkMapUnpayWallToOAF {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
|
@ -4,13 +4,14 @@ import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
||||||
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Instance, OpenAccessRoute, Publication}
|
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Instance, OpenAccessRoute, Publication}
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||||
|
import eu.dnetlib.doiboost.uw.UnpayWallToOAF.get_unpaywall_color
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
package eu.dnetlib.doiboost
|
package eu.dnetlib.dhp.doiboost
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
|
||||||
import eu.dnetlib.doiboost.{DoiBoostMappingUtil, HostedByItemType}
|
import eu.dnetlib.doiboost.{DoiBoostMappingUtil, HostedByItemType}
|
|
@ -1,4 +1,4 @@
|
||||||
package eu.dnetlib.doiboost
|
package eu.dnetlib.dhp.doiboost
|
||||||
|
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
|
@ -3,9 +3,9 @@ package eu.dnetlib.doiboost.mag
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Dataset, SparkSession}
|
import org.apache.spark.sql.{Dataset, SparkSession}
|
||||||
import org.codehaus.jackson.map.ObjectMapper
|
import org.codehaus.jackson.map.ObjectMapper
|
||||||
import org.json4s.DefaultFormats
|
|
||||||
import org.junit.jupiter.api.Assertions._
|
import org.junit.jupiter.api.Assertions._
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
import org.json4s.DefaultFormats
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import java.sql.Timestamp
|
import java.sql.Timestamp
|
|
@ -10,9 +10,10 @@ import org.junit.jupiter.api.io.TempDir
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import scala.collection.JavaConversions._
|
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
|
||||||
|
import scala.collection.JavaConversions._
|
||||||
|
|
||||||
class MappingORCIDToOAFTest {
|
class MappingORCIDToOAFTest {
|
||||||
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
|
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
|
@ -3,11 +3,11 @@ package eu.dnetlib.doiboost.uw
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.schema.oaf.OpenAccessRoute
|
import eu.dnetlib.dhp.schema.oaf.OpenAccessRoute
|
||||||
import org.junit.jupiter.api.Assertions._
|
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
|
||||||
|
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
import org.junit.jupiter.api.Assertions._
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
class UnpayWallMappingTest {
|
class UnpayWallMappingTest {
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.hostedbymap
|
package eu.dnetlib.dhp.oa.graph.hostedbymap
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
||||||
import org.apache.spark.sql.expressions.Aggregator
|
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
|
||||||
|
import org.apache.spark.sql.expressions.Aggregator
|
||||||
|
|
||||||
|
|
||||||
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
|
@ -2,12 +2,13 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkApplyHostedByMapToResult.{applyHBtoPubs, getClass}
|
||||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource
|
import eu.dnetlib.dhp.schema.oaf.{Datasource, Publication}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
|
@ -5,13 +5,16 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Instance, OpenAccessRoute, Publication}
|
import eu.dnetlib.dhp.schema.oaf.{Datasource, Instance, OpenAccessRoute, Publication}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
|
|
||||||
object SparkApplyHostedByMapToResult {
|
object SparkApplyHostedByMapToResult {
|
||||||
|
|
||||||
def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = {
|
def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = {
|
||||||
|
@ -36,7 +39,6 @@ object SparkApplyHostedByMapToResult {
|
||||||
p
|
p
|
||||||
})(Encoders.bean(classOf[Publication]))
|
})(Encoders.bean(classOf[Publication]))
|
||||||
}
|
}
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
|
|
|
@ -3,15 +3,18 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Journal, Publication}
|
import eu.dnetlib.dhp.schema.oaf.{Journal, Publication}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
object SparkPrepareHostedByInfoToApply {
|
object SparkPrepareHostedByInfoToApply {
|
||||||
|
|
||||||
implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
|
|
@ -1,20 +1,22 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.hostedbymap
|
package eu.dnetlib.dhp.oa.graph.hostedbymap
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel}
|
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel}
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource
|
import eu.dnetlib.dhp.schema.oaf.Datasource
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.conf.Configuration
|
|
||||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
|
||||||
import org.apache.hadoop.io.compress.GzipCodec
|
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
|
import org.apache.hadoop.conf.Configuration
|
||||||
|
import org.apache.hadoop.fs.FileSystem
|
||||||
|
import org.apache.hadoop.fs.Path
|
||||||
import java.io.PrintWriter
|
import java.io.PrintWriter
|
||||||
|
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
|
|
||||||
|
|
||||||
object SparkProduceHostedByMap {
|
object SparkProduceHostedByMap {
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,6 +52,7 @@ object SparkProduceHostedByMap {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def getHostedByItemType(id:String, officialname: String, issn:String, eissn:String, issnl:String, oa:Boolean): HostedByItemType = {
|
def getHostedByItemType(id:String, officialname: String, issn:String, eissn:String, issnl:String, oa:Boolean): HostedByItemType = {
|
||||||
if(issn != null){
|
if(issn != null){
|
||||||
if(eissn != null){
|
if(eissn != null){
|
||||||
|
@ -160,6 +163,7 @@ object SparkProduceHostedByMap {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode : String):Unit = {
|
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode : String):Unit = {
|
||||||
val conf = new Configuration()
|
val conf = new Configuration()
|
||||||
|
|
||||||
|
@ -178,6 +182,7 @@ object SparkProduceHostedByMap {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
|
@ -4,14 +4,20 @@ import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport
|
import eu.dnetlib.dhp.common.HdfsSupport
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport
|
import eu.dnetlib.dhp.schema.common.ModelSupport
|
||||||
|
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
|
import org.apache.commons.io.IOUtils
|
||||||
|
import org.apache.commons.lang3.StringUtils
|
||||||
|
import org.apache.http.client.methods.HttpGet
|
||||||
|
import org.apache.http.impl.client.HttpClients
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.apache.spark.{SparkConf, SparkContext}
|
import org.apache.spark.{SparkConf, SparkContext}
|
||||||
import org.slf4j.LoggerFactory
|
import org.slf4j.LoggerFactory
|
||||||
|
|
||||||
import scala.io.Source
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
object CopyHdfsOafSparkApplication {
|
object CopyHdfsOafSparkApplication {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.dhp.oa.graph.resolution
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import eu.dnetlib.dhp.common.HdfsSupport
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType
|
import eu.dnetlib.dhp.schema.common.EntityType
|
||||||
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.oa.graph.resolution
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport
|
import eu.dnetlib.dhp.common.HdfsSupport
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation
|
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
import org.apache.hadoop.fs.{FileSystem, Path}
|
|
@ -0,0 +1,31 @@
|
||||||
|
package eu.dnetlib.dhp.oa.sx.graphimport
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
|
import org.apache.commons.io.IOUtils
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.sql.SparkSession
|
||||||
|
|
||||||
|
object SparkDataciteToOAF {
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: Array[String]): Unit = {
|
||||||
|
val conf: SparkConf = new SparkConf()
|
||||||
|
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/datacite_to_df_params.json")))
|
||||||
|
parser.parseArgument(args)
|
||||||
|
val spark: SparkSession =
|
||||||
|
SparkSession
|
||||||
|
.builder()
|
||||||
|
.config(conf)
|
||||||
|
.appName(getClass.getSimpleName)
|
||||||
|
.master(parser.get("master")).getOrCreate()
|
||||||
|
import spark.implicits._
|
||||||
|
|
||||||
|
|
||||||
|
val sc = spark.sparkContext
|
||||||
|
|
||||||
|
val inputPath = parser.get("inputPath")
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -2,7 +2,7 @@ package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.io.compress.GzipCodec
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
|
@ -5,10 +5,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
|
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
|
||||||
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
|
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.io.compress.GzipCodec
|
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
import org.apache.hadoop.io.compress._
|
||||||
|
|
||||||
object SparkConvertObjectToJson {
|
object SparkConvertObjectToJson {
|
||||||
|
|
|
@ -2,12 +2,11 @@ package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Software, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkConvertRDDtoDataset {
|
object SparkConvertRDDtoDataset {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
|
@ -1,12 +1,14 @@
|
||||||
package eu.dnetlib.dhp.sx.graph
|
package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
object SparkCreateInputGraph {
|
object SparkCreateInputGraph {
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
@ -39,6 +41,9 @@ object SparkCreateInputGraph {
|
||||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val sourcePath = parser.get("sourcePath")
|
val sourcePath = parser.get("sourcePath")
|
||||||
log.info(s"sourcePath -> $sourcePath")
|
log.info(s"sourcePath -> $sourcePath")
|
||||||
val targetPath = parser.get("targetPath")
|
val targetPath = parser.get("targetPath")
|
|
@ -9,7 +9,7 @@ import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils.RelatedEntities
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.functions.count
|
import org.apache.spark.sql.functions.count
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkCreateScholix {
|
object SparkCreateScholix {
|
|
@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
|
||||||
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
|
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
object SparkCreateSummaryObject {
|
object SparkCreateSummaryObject {
|
|
@ -5,7 +5,6 @@ import org.apache.spark.sql.{Encoder, Encoders}
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
|
||||||
import java.util.regex.Pattern
|
import java.util.regex.Pattern
|
||||||
import scala.language.postfixOps
|
import scala.language.postfixOps
|
||||||
import scala.xml.{Elem, Node, XML}
|
import scala.xml.{Elem, Node, XML}
|
|
@ -2,12 +2,13 @@ package eu.dnetlib.dhp.sx.graph.pangaea
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
|
||||||
import org.apache.spark.{SparkConf, SparkContext}
|
import org.apache.spark.{SparkConf, SparkContext}
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.io.Source
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
object SparkGeneratePanagaeaDataset {
|
object SparkGeneratePanagaeaDataset {
|
||||||
|
|
||||||
|
|
||||||
|
@ -45,4 +46,7 @@ object SparkGeneratePanagaeaDataset {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.dhp.sx.graph.scholix
|
package eu.dnetlib.dhp.sx.graph.scholix
|
||||||
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
|
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
|
||||||
import eu.dnetlib.dhp.schema.sx.scholix._
|
import eu.dnetlib.dhp.schema.sx.scholix._
|
||||||
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
|
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
|
||||||
|
@ -10,9 +11,9 @@ import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
|
||||||
import scala.io.Source
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.io.Source
|
||||||
|
import scala.language.postfixOps
|
||||||
|
|
||||||
object ScholixUtils {
|
object ScholixUtils {
|
||||||
|
|
||||||
|
@ -20,7 +21,6 @@ object ScholixUtils {
|
||||||
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
|
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
|
||||||
|
|
||||||
val DATE_RELATION_KEY:String = "RelationDate"
|
val DATE_RELATION_KEY:String = "RelationDate"
|
||||||
|
|
||||||
case class RelationVocabulary(original:String, inverse:String){}
|
case class RelationVocabulary(original:String, inverse:String){}
|
||||||
|
|
||||||
case class RelatedEntities(id:String, relatedDataset:Long, relatedPublication:Long){}
|
case class RelatedEntities(id:String, relatedDataset:Long, relatedPublication:Long){}
|
||||||
|
@ -66,6 +66,7 @@ object ScholixUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val statsAggregator:Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] with Serializable {
|
val statsAggregator:Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] with Serializable {
|
||||||
override def zero: RelatedEntities = null
|
override def zero: RelatedEntities = null
|
||||||
|
|
||||||
|
@ -84,7 +85,8 @@ object ScholixUtils {
|
||||||
if (b1!= null && b2!= null)
|
if (b1!= null && b2!= null)
|
||||||
RelatedEntities(b1.id, b1.relatedDataset+ b2.relatedDataset, b1.relatedPublication+ b2.relatedPublication)
|
RelatedEntities(b1.id, b1.relatedDataset+ b2.relatedDataset, b1.relatedPublication+ b2.relatedPublication)
|
||||||
|
|
||||||
else if (b1 != null)
|
else
|
||||||
|
if (b1!= null)
|
||||||
b1
|
b1
|
||||||
else
|
else
|
||||||
b2
|
b2
|
||||||
|
@ -142,6 +144,7 @@ object ScholixUtils {
|
||||||
s
|
s
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -200,7 +203,8 @@ object ScholixUtils {
|
||||||
|
|
||||||
if (summaryObject.getDate!= null && !summaryObject.getDate.isEmpty)
|
if (summaryObject.getDate!= null && !summaryObject.getDate.isEmpty)
|
||||||
r.setPublicationDate(summaryObject.getDate.get(0))
|
r.setPublicationDate(summaryObject.getDate.get(0))
|
||||||
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
|
if (summaryObject.getPublisher!= null && !summaryObject.getPublisher.isEmpty)
|
||||||
|
{
|
||||||
val plist:List[ScholixEntityId] =summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
|
val plist:List[ScholixEntityId] =summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
|
||||||
|
|
||||||
if (plist.nonEmpty)
|
if (plist.nonEmpty)
|
||||||
|
@ -224,6 +228,9 @@ object ScholixUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def scholixFromSource(relation:Relation, source:ScholixSummary):Scholix = {
|
def scholixFromSource(relation:Relation, source:ScholixSummary):Scholix = {
|
||||||
|
|
||||||
if (relation== null || source== null)
|
if (relation== null || source== null)
|
|
@ -3,9 +3,13 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkPrepareHostedByInfoToApply.{joinResHBM, prepareResultInfo, toEntityInfo}
|
import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkPrepareHostedByInfoToApply.{joinResHBM, prepareResultInfo, toEntityInfo}
|
||||||
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{Datasource, OpenAccessRoute, Publication}
|
||||||
|
import javax.management.openmbean.OpenMBeanAttributeInfo
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||||
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue}
|
import org.junit.jupiter.api.Assertions.{assertEquals, assertTrue}
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
|
|
@ -4,9 +4,10 @@ import eu.dnetlib.dhp.schema.oaf.Datasource
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.Serialization.write
|
import org.junit.jupiter.api.Assertions.{assertNotNull, assertTrue}
|
||||||
import org.junit.jupiter.api.Assertions._
|
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
import org.junit.jupiter.api.Assertions._
|
||||||
|
import org.json4s.jackson.Serialization.write
|
||||||
|
|
||||||
class TestPreprocess extends java.io.Serializable{
|
class TestPreprocess extends java.io.Serializable{
|
||||||
|
|
|
@ -159,7 +159,6 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
|
|
||||||
|
|
||||||
val datDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
|
val datDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
|
||||||
|
|
||||||
val td = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
val td = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.sx.pangaea
|
||||||
import eu.dnetlib.dhp.sx.graph.pangaea.PangaeaUtils
|
import eu.dnetlib.dhp.sx.graph.pangaea.PangaeaUtils
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
|
||||||
|
import java.util.TimeZone
|
||||||
import java.text.SimpleDateFormat
|
import java.text.SimpleDateFormat
|
||||||
import java.util.Date
|
import java.util.Date
|
||||||
import scala.io.Source
|
import scala.io.Source
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue