2024-02-29 18:16:15 +01:00
|
|
|
package eu.dnetlib.dhp.collection.mag
|
|
|
|
|
2024-04-05 11:10:44 +02:00
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper
|
2024-03-08 16:31:40 +01:00
|
|
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
|
2024-04-05 11:10:44 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.{OafMapperUtils, PidType}
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, Journal, Publication, Result, Dataset => OafDataset}
|
2024-04-03 17:07:14 +02:00
|
|
|
import eu.dnetlib.dhp.utils.DHPUtils
|
2024-02-29 18:16:15 +01:00
|
|
|
import org.apache.spark.sql.types._
|
|
|
|
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
|
|
|
import org.json4s
|
|
|
|
import org.json4s.DefaultFormats
|
|
|
|
import org.json4s.jackson.JsonMethods.parse
|
|
|
|
|
2024-03-08 16:31:40 +01:00
|
|
|
import scala.collection.JavaConverters._
|
|
|
|
|
2024-03-05 11:44:59 +01:00
|
|
|
/** Flat, fully optional representation of one MAG paper.
  *
  * Instances are turned into OAF results by `MagUtility.convertMAGtoOAF`, which
  * reads the following fields in particular:
  *  - `paperId`: source of the MAG pid and of the OAF identifier
  *  - `docType` / `originalVenue`: select the OAF result type
  *  - `doi`: emitted as alternate identifier of the instance
  *  - `urls`: instance URLs; papers without a URL list are discarded
  *
  * Most field names mirror the columns of the `Papers` stream; the journal,
  * conference, abstract, author and URL fields presumably come from joins with
  * the other streams — TODO confirm against the job that builds this record.
  */
case class MAGPaper(
  // --- identity and typing ---
  paperId: Option[Long],
  doi: Option[String],
  docType: Option[String],
  // --- titles ---
  paperTitle: Option[String],
  originalTitle: Option[String],
  bookTitle: Option[String],
  // --- publication dates ---
  year: Option[Int],
  date: Option[String],
  onlineDate: Option[String],
  publisher: Option[String],
  // --- journal ---
  journalId: Option[Long],
  journalName: Option[String],
  journalIssn: Option[String],
  journalPublisher: Option[String],
  // --- conference ---
  conferenceSeriesId: Option[Long],
  conferenceInstanceId: Option[Long],
  conferenceName: Option[String],
  conferenceLocation: Option[String],
  conferenceStartDate: Option[String],
  conferenceEndDate: Option[String],
  // --- bibliographic position ---
  volume: Option[String],
  issue: Option[String],
  firstPage: Option[String],
  lastPage: Option[String],
  // --- counters ---
  referenceCount: Option[Long],
  citationCount: Option[Long],
  estimatedCitation: Option[Long],
  // --- venue / family / misc ---
  originalVenue: Option[String],
  familyId: Option[Long],
  familyRank: Option[Int],
  docSubTypes: Option[String],
  createdDate: Option[String],
  // --- enrichments ---
  abstractText: Option[String],
  authors: Option[List[MAGAuthor]],
  urls: Option[List[String]]
)
|
2024-02-29 18:16:15 +01:00
|
|
|
|
2024-03-05 11:44:59 +01:00
|
|
|
/** One author entry attached to a [[MAGPaper]].
  *
  * Field names are capitalised like the MAG column names they carry. When a
  * paper is converted to OAF, `AuthorName` becomes the author full name and
  * `AuthorId` is used to build the author pid.
  *
  * @param AffiliationId        identifier of the affiliation, if any
  * @param AuthorSequenceNumber sequence number of the author within the paper
  * @param AffiliationName      name of the affiliation, if any
  * @param AuthorName           author name
  * @param AuthorId             MAG identifier of the author
  * @param GridId               GRID identifier (presumably of the affiliation — TODO confirm)
  */
case class MAGAuthor(
  AffiliationId: Option[Long],
  AuthorSequenceNumber: Option[Int],
  AffiliationName: Option[String],
  AuthorName: Option[String],
  AuthorId: Option[Long],
  GridId: Option[String]
)
|
|
|
|
|
|
|
|
object MagUtility extends Serializable {
|
2024-02-29 18:16:15 +01:00
|
|
|
|
2024-04-05 11:10:44 +02:00
|
|
|
// Jackson mapper used by convertMAGtoOAF to serialise the generated OAF result.
val mapper = new ObjectMapper

// "collected from" reference (MAG id / MAG name) stamped on every generated
// result and on its instance.
val MAGCollectedFrom = OafMapperUtils.keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)
|
|
|
|
|
2024-02-29 18:16:15 +01:00
|
|
|
// Translation table from the type names used in the `stream` column
// specifications to Spark SQL data types. Unsigned names map onto the same
// Spark type as their signed counterpart.
val datatypedict = Map(
  // booleans
  "bool" -> BooleanType,
  // integral numbers
  "int"   -> IntegerType,
  "uint"  -> IntegerType,
  "long"  -> LongType,
  "ulong" -> LongType,
  // floating point
  "float" -> FloatType,
  // text and dates
  "string"   -> StringType,
  "DateTime" -> DateType
)
|
|
|
|
|
|
|
|
// Catalogue of the MAG dump files, keyed by entity name.
// Each value is (path relative to the dump root, column specifications).
// A column specification has the form "Name:type"; a trailing '?' on the type
// marks the column as nullable (see getSchema).
val stream = Map(
  (
    "Affiliations",
    (
      "mag/Affiliations.txt",
      Seq(
        "AffiliationId:long",
        "Rank:uint",
        "NormalizedName:string",
        "DisplayName:string",
        "GridId:string",
        "OfficialPage:string",
        "WikiPage:string",
        "PaperCount:long",
        "PaperFamilyCount:long",
        "CitationCount:long",
        "Iso3166Code:string",
        "Latitude:float?",
        "Longitude:float?",
        "CreatedDate:DateTime"
      )
    )
  ),
  (
    "AuthorExtendedAttributes",
    (
      "mag/AuthorExtendedAttributes.txt",
      Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")
    )
  ),
  (
    "Authors",
    (
      "mag/Authors.txt",
      Seq(
        "AuthorId:long",
        "Rank:uint",
        "NormalizedName:string",
        "DisplayName:string",
        "LastKnownAffiliationId:long?",
        "PaperCount:long",
        "PaperFamilyCount:long",
        "CitationCount:long",
        "CreatedDate:DateTime"
      )
    )
  ),
  (
    "ConferenceInstances",
    (
      "mag/ConferenceInstances.txt",
      Seq(
        "ConferenceInstanceId:long",
        "NormalizedName:string",
        "DisplayName:string",
        "ConferenceSeriesId:long",
        "Location:string",
        "OfficialUrl:string",
        "StartDate:DateTime?",
        "EndDate:DateTime?",
        "AbstractRegistrationDate:DateTime?",
        "SubmissionDeadlineDate:DateTime?",
        "NotificationDueDate:DateTime?",
        "FinalVersionDueDate:DateTime?",
        "PaperCount:long",
        "PaperFamilyCount:long",
        "CitationCount:long",
        "Latitude:float?",
        "Longitude:float?",
        "CreatedDate:DateTime"
      )
    )
  ),
  (
    "ConferenceSeries",
    (
      "mag/ConferenceSeries.txt",
      Seq(
        "ConferenceSeriesId:long",
        "Rank:uint",
        "NormalizedName:string",
        "DisplayName:string",
        "PaperCount:long",
        "PaperFamilyCount:long",
        "CitationCount:long",
        "CreatedDate:DateTime"
      )
    )
  ),
  (
    "EntityRelatedEntities",
    (
      "advanced/EntityRelatedEntities.txt",
      Seq(
        "EntityId:long",
        "EntityType:string",
        "RelatedEntityId:long",
        "RelatedEntityType:string",
        "RelatedType:int",
        "Score:float"
      )
    )
  ),
  (
    "FieldOfStudyChildren",
    (
      "advanced/FieldOfStudyChildren.txt",
      Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")
    )
  ),
  (
    "FieldOfStudyExtendedAttributes",
    (
      "advanced/FieldOfStudyExtendedAttributes.txt",
      Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")
    )
  ),
  (
    "FieldsOfStudy",
    (
      "advanced/FieldsOfStudy.txt",
      Seq(
        "FieldOfStudyId:long",
        "Rank:uint",
        "NormalizedName:string",
        "DisplayName:string",
        "MainType:string",
        "Level:int",
        "PaperCount:long",
        "PaperFamilyCount:long",
        "CitationCount:long",
        "CreatedDate:DateTime"
      )
    )
  ),
  (
    "Journals",
    (
      "mag/Journals.txt",
      Seq(
        "JournalId:long",
        "Rank:uint",
        "NormalizedName:string",
        "DisplayName:string",
        "Issn:string",
        "Publisher:string",
        "Webpage:string",
        "PaperCount:long",
        "PaperFamilyCount:long",
        "CitationCount:long",
        "CreatedDate:DateTime"
      )
    )
  ),
  (
    "PaperAbstractsInvertedIndex",
    (
      "nlp/PaperAbstractsInvertedIndex.txt.*",
      Seq("PaperId:long", "IndexedAbstract:string")
    )
  ),
  (
    "PaperAuthorAffiliations",
    (
      "mag/PaperAuthorAffiliations.txt",
      Seq(
        "PaperId:long",
        "AuthorId:long",
        "AffiliationId:long?",
        "AuthorSequenceNumber:uint",
        "OriginalAuthor:string",
        "OriginalAffiliation:string"
      )
    )
  ),
  (
    "PaperCitationContexts",
    (
      "nlp/PaperCitationContexts.txt",
      Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")
    )
  ),
  (
    "PaperExtendedAttributes",
    (
      "mag/PaperExtendedAttributes.txt",
      Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")
    )
  ),
  (
    "PaperFieldsOfStudy",
    (
      "advanced/PaperFieldsOfStudy.txt",
      Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")
    )
  ),
  (
    "PaperMeSH",
    (
      "advanced/PaperMeSH.txt",
      Seq(
        "PaperId:long",
        "DescriptorUI:string",
        "DescriptorName:string",
        "QualifierUI:string",
        "QualifierName:string",
        "IsMajorTopic:bool"
      )
    )
  ),
  (
    "PaperRecommendations",
    (
      "advanced/PaperRecommendations.txt",
      Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")
    )
  ),
  (
    "PaperReferences",
    (
      "mag/PaperReferences.txt",
      Seq("PaperId:long", "PaperReferenceId:long")
    )
  ),
  (
    "PaperResources",
    (
      "mag/PaperResources.txt",
      Seq(
        "PaperId:long",
        "ResourceType:int",
        "ResourceUrl:string",
        "SourceUrl:string",
        "RelationshipType:int"
      )
    )
  ),
  (
    "PaperUrls",
    (
      "mag/PaperUrls.txt",
      Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")
    )
  ),
  (
    "Papers",
    (
      "mag/Papers.txt",
      Seq(
        "PaperId:long",
        "Rank:uint",
        "Doi:string",
        "DocType:string",
        "PaperTitle:string",
        "OriginalTitle:string",
        "BookTitle:string",
        "Year:int?",
        "Date:DateTime?",
        "OnlineDate:DateTime?",
        "Publisher:string",
        "JournalId:long?",
        "ConferenceSeriesId:long?",
        "ConferenceInstanceId:long?",
        "Volume:string",
        "Issue:string",
        "FirstPage:string",
        "LastPage:string",
        "ReferenceCount:long",
        "CitationCount:long",
        "EstimatedCitation:long",
        "OriginalVenue:string",
        "FamilyId:long?",
        "FamilyRank:uint?",
        "DocSubTypes:string",
        "CreatedDate:DateTime"
      )
    )
  ),
  (
    "RelatedFieldOfStudy",
    (
      "advanced/RelatedFieldOfStudy.txt",
      Seq(
        "FieldOfStudyId1:long",
        "Type1:string",
        "FieldOfStudyId2:long",
        "Type2:string",
        "Rank:float"
      )
    )
  )
)
|
|
|
|
|
|
|
|
/** Builds the Spark schema of a MAG stream from its column specifications.
  *
  * Each specification in `stream(streamName)._2` is split on ':'; the first
  * piece is the column name and the last one the type name, looked up in
  * `datatypedict`. A type name ending with '?' yields a nullable column.
  *
  * @param streamName key of the entity in `stream`
  * @return a `StructType` with one field per column, in declaration order
  * @throws NoSuchElementException if the stream name or a type name is unknown
  */
def getSchema(streamName: String): StructType = {
  val (_, columnSpecs) = stream(streamName)
  columnSpecs.foldLeft(new StructType()) { (partialSchema, spec) =>
    val pieces = spec.split(":")
    val declaredType = pieces.last
    val isNullable = declaredType.endsWith("?")
    // strip the nullability marker before the dictionary lookup
    val typeName = if (isNullable) declaredType.replace("?", "") else declaredType
    partialSchema.add(StructField(pieces.head, datatypedict(typeName), isNullable))
  }
}
|
|
|
|
|
2024-03-05 11:44:59 +01:00
|
|
|
/** Reads one MAG entity file as a DataFrame.
  *
  * The file is read as header-less, tab-delimited text using the schema
  * produced by `getSchema(entity)`.
  *
  * @param spark    active Spark session
  * @param entity   key of the entity in `stream`
  * @param basePath root folder of the MAG dump; the relative path from `stream`
  *                 is appended to it
  * @return the loaded rows, or `null` when `entity` is not a known stream
  */
def loadMagEntity(spark: SparkSession, entity: String, basePath: String): Dataset[Row] =
  stream.get(entity) match {
    case None => null
    case Some((relativePath, _)) =>
      val entitySchema = getSchema(entity)
      val reader = spark.read
        .option("header", "false")
        .option("charset", "UTF8")
        .option("delimiter", "\t")
        .schema(entitySchema)
      reader.csv(s"$basePath/$relativePath")
  }
|
2024-03-05 11:44:59 +01:00
|
|
|
|
2024-04-04 21:06:04 +02:00
|
|
|
/** Creates the empty OAF result matching a MAG document type.
  *
  * The returned result carries a `DataInfo` (trust "0.9", sysimport action set
  * provenance) and exactly one `Instance` whose instance type is chosen as
  * follows (doc type compared case-insensitively):
  *
  *  - absent type            -> Publication, 0038 "Other literature type"
  *  - book                   -> Publication, 0002 "Book"
  *  - bookchapter            -> Publication, 0013 "Part of book or chapter of book"
  *  - journal                -> Publication, 0043 "Journal"
  *  - patent                 -> depends on the venue name (see below)
  *  - repository             -> Publication, 0038, flagged invisible
  *  - thesis                 -> Publication, 0044 "Thesis"
  *  - dataset                -> Dataset,     0021 "Dataset"
  *  - conference             -> Publication, 0001 "Article"
  *
  * For patents the lower-cased venue decides: containing "patent"/"brevet" ->
  * 0019 "Patent"; containing "journal of" -> 0043 "Journal"; containing
  * "proceedings"/"conference"/"workshop"/"symposium" -> 0001 "Article";
  * anything else (or no venue) -> no result.
  *
  * @param magType MAG document type; `null`, `None` and `Some(null)` all count as absent
  * @param source  original venue name, only consulted for patents
  * @return the initialised result, or `null` when the type cannot be mapped
  *         (including doc types this method does not know)
  */
def createResultFromType(magType: Option[String], source: Option[String]): Result = {

  // Qualifier in the dnet publication-resource vocabulary.
  def resourceType(code: String, label: String) =
    qualifier(code, label, ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE)

  // Pairs a freshly created result with the instance type it must expose.
  def typed(result: Result, code: String, label: String) =
    Option((result, resourceType(code, label)))

  // Treats a null Option reference and Some(null) like None.
  def clean(value: Option[String]): Option[String] =
    Option(value).flatMap(v => v).flatMap(s => Option(s))

  val di = new DataInfo
  di.setDeletedbyinference(false)
  di.setInferred(false)
  di.setInvisible(false)
  di.setTrust("0.9")
  di.setProvenanceaction(
    OafMapperUtils.qualifier(
      ModelConstants.SYSIMPORT_ACTIONSET,
      ModelConstants.SYSIMPORT_ACTIONSET,
      ModelConstants.DNET_PROVENANCE_ACTIONS,
      ModelConstants.DNET_PROVENANCE_ACTIONS
    )
  )

  val typedResult = clean(magType).map(_.toLowerCase) match {
    case None =>
      typed(new Publication, "0038", "Other literature type")
    case Some("book") =>
      typed(new Publication, "0002", "Book")
    case Some("bookchapter") =>
      // The code used to be "00013"; every other code here is four digits and
      // the vocabulary code for this label is believed to be "0013".
      // NOTE(review): confirm against the dnet:publication_resource vocabulary.
      typed(new Publication, "0013", "Part of book or chapter of book")
    case Some("journal") =>
      typed(new Publication, "0043", "Journal")
    case Some("patent") =>
      // MAG labels many non-patent documents as patents: the venue name is
      // used to tell real patents from journal and conference material.
      clean(source).map(_.toLowerCase).flatMap { venue =>
        val conferenceHints = Seq("proceedings", "conference", "workshop", "symposium")
        if (venue.contains("patent") || venue.contains("brevet"))
          typed(new Publication, "0019", "Patent")
        else if (venue.contains("journal of"))
          typed(new Publication, "0043", "Journal")
        else if (conferenceHints.exists(hint => venue.contains(hint)))
          typed(new Publication, "0001", "Article")
        else
          None
      }
    case Some("repository") =>
      di.setInvisible(true)
      typed(new Publication, "0038", "Other literature type")
    case Some("thesis") =>
      typed(new Publication, "0044", "Thesis")
    case Some("dataset") =>
      typed(new OafDataset, "0021", "Dataset")
    case Some("conference") =>
      typed(new Publication, "0001", "Article")
    case Some(_) =>
      // Unknown doc type: previously a MatchError, now reported as "no result"
      // like the other unmappable cases.
      None
  }

  typedResult.map { case (result, instanceType) =>
    result.setDataInfo(di)
    val instance = new Instance
    instance.setInstancetype(instanceType)
    result.setInstance(List(instance).asJava)
    result
  }.orNull
}
|
|
|
|
|
2024-04-05 11:10:44 +02:00
|
|
|
/** Converts a MAG paper into the JSON serialisation of an OAF result.
  *
  * The result type comes from `createResultFromType(paper.docType, paper.originalVenue)`.
  * On top of it this method sets: collected-from, MAG pid / original id / OAF id
  * (`50|mag_________::md5(paperId)`), main title, date of acceptance (the paper
  * date, otherwise `<year>-01-01` for years after 1700), "published-online"
  * relevant date, publisher, journal (publications only), abstract, authors
  * with their MAG pid, and the instance details (pid, DOI as alternate
  * identifier, URLs, unknown hosting repository, unknown access right).
  *
  * Optional values holding `null` are treated as absent.
  *
  * @param paper the MAG paper to convert
  * @return the JSON string produced by `mapper`, or `null` when the paper is
  *         skipped: no URL list, no paper id, or no OAF type for its doc type
  */
def convertMAGtoOAF(paper: MAGPaper): String = {

  // Some(null) can reach this code, so unwrap defensively.
  def present[A](value: Option[A]): Option[A] = value.flatMap(v => Option(v))

  // Structured property carrying a MAG identifier.
  def magPid(value: String) =
    structuredProperty(
      value,
      qualifier(
        PidType.mag_id.toString,
        PidType.mag_id.toString,
        ModelConstants.DNET_PID_TYPES,
        ModelConstants.DNET_PID_TYPES
      ),
      null
    )

  val converted = for {
    // FILTER all the MAG paper with no URL
    urls <- present(paper.urls)
    // a paper without identifier cannot be given an OAF id (used to throw)
    paperId <- paper.paperId
    result <- Option(createResultFromType(paper.docType, paper.originalVenue))
  } yield {
    val magId = paperId.toString
    val pidList = List(magPid(magId))

    result.setCollectedfrom(List(MAGCollectedFrom).asJava)
    result.setPid(pidList.asJava)
    result.setOriginalId(pidList.map(_.getValue).asJava)
    result.setId(s"50|mag_________::${DHPUtils.md5(magId)}")

    // a missing title used to abort the conversion with NoSuchElementException
    present(paper.paperTitle).foreach { title =>
      result.setTitle(List(structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, null)).asJava)
    }

    // explicit date first, otherwise fall back to January 1st of a plausible year
    present(paper.date)
      .orElse(paper.year.filter(_ > 1700).map(year => s"$year-01-01"))
      .foreach(date => result.setDateofacceptance(field(date, null)))

    present(paper.onlineDate).foreach { onlineDate =>
      result.setRelevantdate(
        List(
          structuredProperty(
            onlineDate,
            qualifier(
              "published-online",
              "published-online",
              ModelConstants.DNET_DATACITE_DATE,
              ModelConstants.DNET_DATACITE_DATE
            ),
            null
          )
        ).asJava
      )
    }

    present(paper.publisher).foreach(publisher => result.setPublisher(field(publisher, null)))

    for {
      _ <- paper.journalId
      journalName <- present(paper.journalName)
    } {
      val journal = new Journal
      journal.setName(journalName)
      journal.setSp(paper.firstPage.orNull)
      journal.setEp(paper.lastPage.orNull)
      journal.setIssnPrinted(paper.journalIssn.orNull)
      journal.setVol(paper.volume.orNull)
      journal.setIss(paper.issue.orNull)
      journal.setConferenceplace(paper.conferenceLocation.orNull)
      result match {
        case publication: Publication => publication.setJournal(journal)
        // Only publications receive the journal; other result kinds used to
        // fail here with a MatchError.
        case _ =>
      }
    }

    present(paper.abstractText).foreach { abstractText =>
      result.setDescription(List(field(abstractText, null)).asJava)
    }

    val instance = result.getInstance().get(0)
    instance.setPid(pidList.asJava)
    present(paper.doi).foreach { doi =>
      instance.setAlternateIdentifier(
        List(
          structuredProperty(
            doi,
            qualifier(
              PidType.doi.toString,
              PidType.doi.toString,
              ModelConstants.DNET_PID_TYPES,
              ModelConstants.DNET_PID_TYPES
            ),
            null
          )
        ).asJava
      )
    }
    instance.setUrl(urls.asJava)
    instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
    instance.setCollectedfrom(MAGCollectedFrom)
    instance.setAccessright(
      accessRight(
        ModelConstants.UNKNOWN,
        ModelConstants.NOT_AVAILABLE,
        ModelConstants.DNET_ACCESS_MODES,
        ModelConstants.DNET_ACCESS_MODES
      )
    )

    present(paper.authors).filter(_.nonEmpty).foreach { magAuthors =>
      val oafAuthors = magAuthors
        .map(magAuthor => (magAuthor, present(magAuthor.AuthorName)))
        .collect { case (magAuthor, Some(fullName)) =>
          val author = new Author
          author.setFullname(fullName)
          // NOTE(review): when GridId was present the previous code appended a
          // second, identical MAG pid built from AuthorId; the GRID id itself
          // was never emitted. The duplicate is dropped here — confirm whether
          // a GRID identifier should be exposed and with which pid type.
          // An author without AuthorId now gets an empty pid list (used to throw).
          author.setPid(magAuthor.AuthorId.toList.map(id => magPid(id.toString)).asJava)
          author
        }
      result.setAuthor(oafAuthors.asJava)
    }

    mapper.writeValueAsString(result)
  }

  converted.orNull
}
|
|
|
|
|
2024-02-29 18:16:15 +01:00
|
|
|
/** Rebuilds a plain-text abstract from its JSON inverted index.
  *
  * The input is expected to contain an integer `IndexLength` (number of token
  * positions) and an `InvertedIndex` object mapping each token to the list of
  * positions where it occurs. Tokens are placed at their positions and joined
  * with a single space; positions that no token claims stay empty strings.
  * Positions outside `[0, IndexLength)` are ignored instead of raising an
  * `ArrayIndexOutOfBoundsException`.
  *
  * @param json_input JSON document with `IndexLength` and `InvertedIndex`
  * @return the reconstructed text, or the empty string when `IndexLength` is
  *         not positive
  */
def convertInvertedIndexString(json_input: String): String = {
  implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats
  val json: json4s.JValue = parse(json_input)
  val indexLength = (json \ "IndexLength").extract[Int]

  if (indexLength <= 0) ""
  else {
    // pre-filled with "" so unclaimed positions need no second pass
    val tokens = Array.fill[String](indexLength)("")
    val invertedIndex = (json \ "InvertedIndex").extract[Map[String, List[Int]]]
    for {
      (token, positions) <- invertedIndex
      position <- positions
      if position >= 0 && position < indexLength
    } {
      tokens(position) = token
    }
    tokens.mkString(" ")
  }
}
|
|
|
|
|
|
|
|
}
|