This commit is contained in:
Claudio Atzori 2021-11-15 14:42:49 +01:00
commit 941a50a2fc
16 changed files with 563 additions and 23 deletions

View File

@ -29,6 +29,13 @@
<goal>testCompile</goal> <goal>testCompile</goal>
</goals> </goals>
</execution> </execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions> </executions>
<configuration> <configuration>
<scalaVersion>${scala.version}</scalaVersion> <scalaVersion>${scala.version}</scalaVersion>

View File

@ -5,94 +5,249 @@ import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
/**
* This class represents an instance of a Pubmed Article extracted from the native XML
*
* @author Sandro La Bruzzo
*/
public class PMArticle implements Serializable { public class PMArticle implements Serializable {
/**
* the Pubmed Identifier
*/
private String pmid; private String pmid;
/**
* the DOI
*/
private String doi; private String doi;
/**
* the Pubmed Date extracted from {@code <PubmedPubDate>}, which specifies a date significant to either the article's history or the citation's processing.
* All {@code <History>} dates will have {@code <Year>}, {@code <Month>}, and {@code <Day>} elements. Some may also have {@code <Hour>}, {@code <Minute>}, and {@code <Second>} element(s).
*/
private String date; private String date;
/**
* This is an 'envelope' element that contains various elements describing the journal cited (i.e., ISSN, Volume, Issue, PubDate and author name(s)); however, it does not contain data itself.
*/
private PMJournal journal; private PMJournal journal;
/**
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the {@code <MedlineTA>} element.
*/
private String title; private String title;
/**
* English-language abstracts are taken directly from the published article.
* If the article does not have a published abstract, the National Library of Medicine does not create one,
* thus the record lacks the {@code <Abstract>} and {@code <AbstractText>} elements. However, in the absence of a formally
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
*/
private String description; private String description;
/**
* the language in which an article was published is recorded in {@code <Language>}.
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
*/
private String language; private String language;
/**
* NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations.
*/
private final List<PMSubject> subjects = new ArrayList<>(); private final List<PMSubject> subjects = new ArrayList<>();
/**
* This element is used to identify the type of article indexed for MEDLINE;
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
*/
private final List<PMSubject> publicationTypes = new ArrayList<>(); private final List<PMSubject> publicationTypes = new ArrayList<>();
/**
* Personal and collective (corporate) author names published with the article are found in {@code <AuthorList>}.
*/
private List<PMAuthor> authors = new ArrayList<>(); private List<PMAuthor> authors = new ArrayList<>();
public List<PMSubject> getPublicationTypes() { /**
return publicationTypes; * <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
} * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
*/
private final List<PMGrant> grants = new ArrayList<>(); private final List<PMGrant> grants = new ArrayList<>();
public List<PMGrant> getGrants() { /**
return grants; * get the DOI
} * @return a DOI
*/
public String getDoi() { public String getDoi() {
return doi; return doi;
} }
/**
* Set the DOI
* @param doi a DOI
*/
public void setDoi(String doi) { public void setDoi(String doi) {
this.doi = doi; this.doi = doi;
} }
/**
* get the Pubmed Identifier
* @return the PMID
*/
public String getPmid() { public String getPmid() {
return pmid; return pmid;
} }
/**
* set the Pubmed Identifier
* @param pmid the Pubmed Identifier
*/
public void setPmid(String pmid) { public void setPmid(String pmid) {
this.pmid = pmid; this.pmid = pmid;
} }
/**
* the Pubmed Date extracted from {@code <PubmedPubDate>}, which specifies a date significant to either the article's history or the citation's processing.
* All {@code <History>} dates will have {@code <Year>}, {@code <Month>}, and {@code <Day>} elements. Some may also have {@code <Hour>}, {@code <Minute>}, and {@code <Second>} element(s).
*
* @return the Pubmed Date
*/
public String getDate() { public String getDate() {
return date; return date;
} }
/**
* Set the pubmed Date
* @param date the Pubmed date
*/
public void setDate(String date) { public void setDate(String date) {
this.date = date; this.date = date;
} }
/**
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
* The NLM journal title abbreviation is exported in the {@code <MedlineTA>} element.
*
* @return the pubmed Journal Extracted
*/
public PMJournal getJournal() { public PMJournal getJournal() {
return journal; return journal;
} }
/**
* Set the mapped pubmed Journal
* @param journal the mapped Pubmed journal
*/
public void setJournal(PMJournal journal) { public void setJournal(PMJournal journal) {
this.journal = journal; this.journal = journal;
} }
/**
* Gets the title extracted for this Pubmed article (not its abstract, see {@link #getDescription()}).
*
* @return the extracted pubmed Title
*/
public String getTitle() { public String getTitle() {
return title; return title;
} }
/**
* set the pubmed title
* @param title the Pubmed title
*/
public void setTitle(String title) { public void setTitle(String title) {
this.title = title; this.title = title;
} }
/**
* English-language abstracts are taken directly from the published article.
* If the article does not have a published abstract, the National Library of Medicine does not create one,
* thus the record lacks the {@code <Abstract>} and {@code <AbstractText>} elements. However, in the absence of a formally
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
*
* @return the Mapped Pubmed Article Abstracts
*/
public String getDescription() { public String getDescription() {
return description; return description;
} }
/**
* Set the Mapped Pubmed Article Abstracts
* @param description the mapped Pubmed article abstract
*/
public void setDescription(String description) { public void setDescription(String description) {
this.description = description; this.description = description;
} }
/**
* Personal and collective (corporate) author names published with the article are found in {@code <AuthorList>}.
*
* @return get the Mapped Authors lists
*/
public List<PMAuthor> getAuthors() { public List<PMAuthor> getAuthors() {
return authors; return authors;
} }
/**
* Set the Mapped Authors lists
* @param authors the mapped authors list
*/
public void setAuthors(List<PMAuthor> authors) { public void setAuthors(List<PMAuthor> authors) {
this.authors = authors; this.authors = authors;
} }
/**
* NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations.
*
* @return the mapped Subjects
*/
public List<PMSubject> getSubjects() { public List<PMSubject> getSubjects() {
return subjects; return subjects;
} }
/**
*
* the language in which an article was published is recorded in {@code <Language>}.
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
*
* @return The mapped Language
*/
public String getLanguage() { public String getLanguage() {
return language; return language;
} }
/**
*
* Set The mapped Language
*
* @param language the mapped Language
*/
public void setLanguage(String language) { public void setLanguage(String language) {
this.language = language; this.language = language;
} }
/**
* Gets the publication types of this article.
* <p>
* This element is used to identify the type of article indexed for MEDLINE;
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
*
* @return the mapped publication types; this is the list held by the article itself, not a copy
*/
public List<PMSubject> getPublicationTypes() {
return publicationTypes;
}
/**
* Gets the grants of this article.
* <p>
* {@code <GrantID>} contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
*
* @return the mapped grants; this is the list held by the article itself, not a copy
*/
public List<PMGrant> getGrants() {
return grants;
}
} }

View File

@ -3,27 +3,57 @@ package eu.dnetlib.dhp.sx.bio.pubmed;
import java.io.Serializable; import java.io.Serializable;
/**
* The type Pubmed author.
*
* @author Sandro La Bruzzo
*/
public class PMAuthor implements Serializable { public class PMAuthor implements Serializable {
private String lastName; private String lastName;
private String foreName; private String foreName;
/**
* Gets last name.
*
* @return the last name
*/
public String getLastName() { public String getLastName() {
return lastName; return lastName;
} }
/**
* Sets last name.
*
* @param lastName the last name
*/
public void setLastName(String lastName) { public void setLastName(String lastName) {
this.lastName = lastName; this.lastName = lastName;
} }
/**
* Gets fore name.
*
* @return the fore name
*/
public String getForeName() { public String getForeName() {
return foreName; return foreName;
} }
/**
* Sets fore name.
*
* @param foreName the fore name
*/
public void setForeName(String foreName) { public void setForeName(String foreName) {
this.foreName = foreName; this.foreName = foreName;
} }
/**
* Gets full name.
*
* @return the full name
*/
public String getFullName() { public String getFullName() {
return String return String
.format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : ""); .format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : "");

View File

@ -1,41 +1,86 @@
package eu.dnetlib.dhp.sx.bio.pubmed; package eu.dnetlib.dhp.sx.bio.pubmed;
/**
* The type Pm grant.
*
* @author Sandro La Bruzzo
*/
public class PMGrant { public class PMGrant {
private String grantID; private String grantID;
private String agency; private String agency;
private String country; private String country;
/**
* Instantiates a new Pm grant.
*/
public PMGrant() { public PMGrant() {
} }
/**
* Instantiates a new Pm grant.
*
* @param grantID the grant id
* @param agency the agency
* @param country the country
*/
public PMGrant(String grantID, String agency, String country) { public PMGrant(String grantID, String agency, String country) {
this.grantID = grantID; this.grantID = grantID;
this.agency = agency; this.agency = agency;
this.country = country; this.country = country;
} }
/**
* Gets grant id.
*
* @return the grant id
*/
public String getGrantID() { public String getGrantID() {
return grantID; return grantID;
} }
/**
* Sets grant id.
*
* @param grantID the grant id
*/
public void setGrantID(String grantID) { public void setGrantID(String grantID) {
this.grantID = grantID; this.grantID = grantID;
} }
/**
* Gets agency.
*
* @return the agency
*/
public String getAgency() { public String getAgency() {
return agency; return agency;
} }
/**
* Sets agency.
*
* @param agency the agency
*/
public void setAgency(String agency) { public void setAgency(String agency) {
this.agency = agency; this.agency = agency;
} }
/**
* Gets country.
*
* @return the country
*/
public String getCountry() { public String getCountry() {
return country; return country;
} }
/**
* Sets country.
*
* @param country the country
*/
public void setCountry(String country) { public void setCountry(String country) {
this.country = country; this.country = country;
} }

View File

@ -3,6 +3,11 @@ package eu.dnetlib.dhp.sx.bio.pubmed;
import java.io.Serializable; import java.io.Serializable;
/**
* The type Pm journal.
*
* @author Sandro La Bruzzo
*/
public class PMJournal implements Serializable { public class PMJournal implements Serializable {
private String issn; private String issn;
@ -11,42 +16,92 @@ public class PMJournal implements Serializable {
private String date; private String date;
private String title; private String title;
/**
* Gets issn.
*
* @return the issn
*/
public String getIssn() { public String getIssn() {
return issn; return issn;
} }
/**
* Sets issn.
*
* @param issn the issn
*/
public void setIssn(String issn) { public void setIssn(String issn) {
this.issn = issn; this.issn = issn;
} }
/**
* Gets volume.
*
* @return the volume
*/
public String getVolume() { public String getVolume() {
return volume; return volume;
} }
/**
* Sets volume.
*
* @param volume the volume
*/
public void setVolume(String volume) { public void setVolume(String volume) {
this.volume = volume; this.volume = volume;
} }
/**
* Gets issue.
*
* @return the issue
*/
public String getIssue() { public String getIssue() {
return issue; return issue;
} }
/**
* Sets issue.
*
* @param issue the issue
*/
public void setIssue(String issue) { public void setIssue(String issue) {
this.issue = issue; this.issue = issue;
} }
/**
* Gets date.
*
* @return the date
*/
public String getDate() { public String getDate() {
return date; return date;
} }
/**
* Sets date.
*
* @param date the date
*/
public void setDate(String date) { public void setDate(String date) {
this.date = date; this.date = date;
} }
/**
* Gets title.
*
* @return the title
*/
public String getTitle() { public String getTitle() {
return title; return title;
} }
/**
* Sets title.
*
* @param title the title
*/
public void setTitle(String title) { public void setTitle(String title) {
this.title = title; this.title = title;
} }

View File

@ -2,6 +2,12 @@ package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
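/**
* Iterator over the [[PMArticle]] instances read from a PubMed XML event stream.
*
* @param xml the XML event reader from which the articles are read
*/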
class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] { class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
var currentArticle:PMArticle = generateNextArticle() var currentArticle:PMArticle = generateNextArticle()

View File

@ -1,40 +1,83 @@
package eu.dnetlib.dhp.sx.bio.pubmed; package eu.dnetlib.dhp.sx.bio.pubmed;
/**
* The type Pubmed subject.
*/
public class PMSubject { public class PMSubject {
private String value; private String value;
private String meshId; private String meshId;
private String registryNumber; private String registryNumber;
/**
* Instantiates a new Pm subject.
*/
public PMSubject() { public PMSubject() {
} }
/**
* Instantiates a new Pm subject.
*
* @param value the value
* @param meshId the mesh id
* @param registryNumber the registry number
*/
public PMSubject(String value, String meshId, String registryNumber) { public PMSubject(String value, String meshId, String registryNumber) {
this.value = value; this.value = value;
this.meshId = meshId; this.meshId = meshId;
this.registryNumber = registryNumber; this.registryNumber = registryNumber;
} }
/**
* Gets value.
*
* @return the value
*/
public String getValue() { public String getValue() {
return value; return value;
} }
/**
* Sets value.
*
* @param value the value
*/
public void setValue(String value) { public void setValue(String value) {
this.value = value; this.value = value;
} }
/**
* Gets mesh id.
*
* @return the mesh id
*/
public String getMeshId() { public String getMeshId() {
return meshId; return meshId;
} }
/**
* Sets mesh id.
*
* @param meshId the mesh id
*/
public void setMeshId(String meshId) { public void setMeshId(String meshId) {
this.meshId = meshId; this.meshId = meshId;
} }
/**
* Gets registry number.
*
* @return the registry number
*/
public String getRegistryNumber() { public String getRegistryNumber() {
return registryNumber; return registryNumber;
} }
/**
* Sets registry number.
*
* @param registryNumber the registry number
*/
public void setRegistryNumber(String registryNumber) { public void setRegistryNumber(String registryNumber) {
this.registryNumber = registryNumber; this.registryNumber = registryNumber;
} }

View File

@ -8,6 +8,9 @@ import scala.collection.JavaConverters._
import java.util.regex.Pattern import java.util.regex.Pattern
/**
* Utility functions for mapping a [[PMArticle]] into an OAF Result.
*/
object PubMedToOaf { object PubMedToOaf {
val SUBJ_CLASS = "keywords" val SUBJ_CLASS = "keywords"
@ -15,7 +18,17 @@ object PubMedToOaf {
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/", "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
"doi" -> "https://dx.doi.org/" "doi" -> "https://dx.doi.org/"
) )
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
/**
* Cleans the DOI by applying a regex, in order to
* discard DOIs that start with a URL
* @param doi input DOI
* @return cleaned DOI
*/
def cleanDoi(doi: String): String = { def cleanDoi(doi: String): String = {
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$" val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
@ -30,6 +43,15 @@ object PubMedToOaf {
null null
} }
/**
*
* Create an instance of class extends Result
* starting from OAF instanceType value
*
* @param cobjQualifier OAF instance type
* @param vocabularies All dnet vocabularies
* @return the correct instance
*/
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid) val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
result_typologies.getClassid match { result_typologies.getClassid match {
@ -42,6 +64,12 @@ object PubMedToOaf {
} }
} }
/**
* Maps the Pubmed journal info into the OAF Journal
*
* @param j the pubmedJournal
* @return the OAF Journal
*/
def mapJournal(j: PMJournal): Journal = { def mapJournal(j: PMJournal): Journal = {
if (j == null) if (j == null)
return null return null
@ -49,6 +77,7 @@ object PubMedToOaf {
journal.setDataInfo(dataInfo) journal.setDataInfo(dataInfo)
journal.setName(j.getTitle) journal.setName(j.getTitle)
journal.setConferencedate(j.getDate)
journal.setVol(j.getVolume) journal.setVol(j.getVolume)
journal.setIssnPrinted(j.getIssn) journal.setIssnPrinted(j.getIssn)
journal.setIss(j.getIssue) journal.setIss(j.getIssue)
@ -57,25 +86,43 @@ object PubMedToOaf {
} }
/**
*
* Looks up the term in the vocabulary: first among the synonyms and, if no synonym matches, among the terms
*
* @param vocabularyName the input vocabulary name
* @param vocabularies all the vocabularies
* @param term the term to find
*
* @return the matching Qualifier (a synonym match takes precedence over a term match)
*/
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = { def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
val a = vocabularies.getSynonymAsQualifier(vocabularyName, term) val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
val b = vocabularies.getTermAsQualifier(vocabularyName, term) val b = vocabularies.getTermAsQualifier(vocabularyName, term)
if (a == null) b else a if (a == null) b else a
} }
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
/**
* Map the Pubmed Article into the OAF instance
*
*
* @param article the pubmed article
* @param vocabularies the vocabularies
* @return the OAF instance, or null if the article cannot be mapped
*/
def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = { def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = {
if (article.getPublicationTypes == null) if (article.getPublicationTypes == null)
return null return null
val i = new Instance
// MAP PMID into pid with classid = classname = pmid
val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)) val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
if (pidList == null) if (pidList == null)
return null return null
// MAP //ArticleId[./@IdType="doi"] into alternateIdentifier with classid = classname = doi
var alternateIdentifier: StructuredProperty = null var alternateIdentifier: StructuredProperty = null
if (article.getDoi != null) { if (article.getDoi != null) {
val normalizedPid = cleanDoi(article.getDoi) val normalizedPid = cleanDoi(article.getDoi)
@ -83,43 +130,64 @@ object PubMedToOaf {
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo) alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
} }
// INSTANCE MAPPING
//--------------------------------------------------------------------------------------
// If the article contains the typology Journal Article then we apply this type // If the article contains the typology Journal Article then we apply this type
//else We have to find a terms that match the vocabulary otherwise we discard it //else We have to find a terms that match the vocabulary otherwise we discard it
val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue)) val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
val pubmedInstance = new Instance
if (ja.isDefined) { if (ja.isDefined) {
val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue) val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
i.setInstancetype(cojbCategory) pubmedInstance.setInstancetype(cojbCategory)
} else { } else {
val i_type = article.getPublicationTypes.asScala val i_type = article.getPublicationTypes.asScala
.map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue)) .map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue))
.find(q => q != null) .find(q => q != null)
if (i_type.isDefined) if (i_type.isDefined)
i.setInstancetype(i_type.get) pubmedInstance.setInstancetype(i_type.get)
else else
return null return null
} }
val result = createResult(i.getInstancetype, vocabularies) val result = createResult(pubmedInstance.getInstancetype, vocabularies)
if (result == null) if (result == null)
return result return result
result.setDataInfo(dataInfo) result.setDataInfo(dataInfo)
i.setPid(pidList.asJava) pubmedInstance.setPid(pidList.asJava)
if (alternateIdentifier != null) if (alternateIdentifier != null)
i.setAlternateIdentifier(List(alternateIdentifier).asJava) pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
result.setInstance(List(i).asJava) result.setInstance(List(pubmedInstance).asJava)
i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut) pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
//CREATE URL From pmid
val urlLists: List[String] = pidList val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
.filter(t => t._1.nonEmpty) .filter(t => t._1.nonEmpty)
.map(t => t._1 + t._2) .map(t => t._1 + t._2)
if (urlLists != null) if (urlLists != null)
i.setUrl(urlLists.asJava) pubmedInstance.setUrl(urlLists.asJava)
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
i.setCollectedfrom(collectedFrom) //ASSIGN DateofAcceptance
pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
//ASSIGN COLLECTEDFROM
pubmedInstance.setCollectedfrom(collectedFrom)
result.setPid(pidList.asJava) result.setPid(pidList.asJava)
//END INSTANCE MAPPING
//--------------------------------------------------------------------------------------
// JOURNAL MAPPING
//--------------------------------------------------------------------------------------
if (article.getJournal != null && result.isInstanceOf[Publication]) if (article.getJournal != null && result.isInstanceOf[Publication])
result.asInstanceOf[Publication].setJournal(mapJournal(article.getJournal)) result.asInstanceOf[Publication].setJournal(mapJournal(article.getJournal))
result.setCollectedfrom(List(collectedFrom).asJava) result.setCollectedfrom(List(collectedFrom).asJava)
//END JOURNAL MAPPING
//--------------------------------------------------------------------------------------
// RESULT MAPPING
//--------------------------------------------------------------------------------------
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
if (article.getTitle == null || article.getTitle.isEmpty) if (article.getTitle == null || article.getTitle.isEmpty)
@ -159,6 +227,9 @@ object PubMedToOaf {
result.setId(article.getPmid) result.setId(article.getPmid)
// END RESULT MAPPING
//--------------------------------------------------------------------------------------
val id = IdentifierFactory.createIdentifier(result) val id = IdentifierFactory.createIdentifier(result)
if (article.getPmid.equalsIgnoreCase(id)) if (article.getPmid.equalsIgnoreCase(id))
return null return null

View File

@ -0,0 +1,9 @@
## DHP-Aggregation
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
of each MDStore.
It defines [mappings](mappings.md) for the transformation of records from different datasources (see the Mappings section).

View File

@ -0,0 +1,7 @@
## DHP-Aggregation
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
of each MDStore.

View File

@ -0,0 +1,18 @@
DHP Aggregation
===============
DHP-Aggregations contains different mappings from original data format into OAF Data Format,
which converge in the graph in different ways:
- Via Action Manager
- Direct in the MdStore on Hadoop
Below is the list of the implemented mappings.
Mappings
=======
1. [PubMed](pubmed.md)
2. [Datacite](datacite.md)

View File

@ -0,0 +1,62 @@
# Pubmed Mapping
This section describes the mapping implemented for [MEDLINE/PubMed](https://pubmed.ncbi.nlm.nih.gov/).
Collection
---------
The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with
the following [shcema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
Parsing
-------
The resposible class of parsing is [PMParser](./scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates
an intermediate mapping of PubMed Article defined [here](/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
Mapping
-------
The table below describes the mapping from the XML Native to the OAF mapping
| Xpath Source | Oaf Field | Notes |
| ----------- | ----------- | ----------- |
| //PMID | pid | classid = classname = pmid |
| | **Instance Mapping** | |
|//PublicationType | InstanceType | If the article contains the typology **Journal Article** then we apply this type; otherwise we have to find a term that matches the vocabulary, and if none is found we discard the article |
|//PMID | instance/PID | Map the pmid also in the pid in the instance |
| //ArticleId[./@IdType="doi"] | instance/alternateIdentifier |classid = classname = doi |
|//PMID | instance/URL | prepend to the PMId the base url https://pubmed.ncbi.nlm.nih.gov/
| //PubmedPubDate | instance/Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assign it
| FOR ALL INSTANCE | CollectedFrom | datasourceName: *Europe PubMed Central* DatasourceId: `ModelConstants.EUROPE_PUBMED_CENTRAL_ID` |
| | **Journal Mapping** | |
|//Journal/PubDate| Journal/Conferencedate | map the date of the Journal
|//Journal/Title| Journal/Name | |
|//Journal/Volume| Journal/Vol | |
|//Journal/ISSN| Journal/issnPrinted | |
|//Journal/Issue| Journal/Iss | |
| | **Publication Mapping** | |
| //PubmedPubDate | Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assign it
| //Title | title | with qualifier ModelConstants.MAIN_TITLE_QUALIFIER
| //AbstractText | Description ||
|//Language| Language| cleaning vocabulary -> dnet:languages
|//DescriptorName| Subject | classId, className = keywords |
| | **Author Mapping** | |
|//Author/LastName| author.Surname| |
|//Author/ForeName| author.Forename| |
|//Author/FullName| author.Fullname| Concatenation of foreName + lastName, if they exist |
|FOR ALL AUTHOR | author.rank| sequential number starting from 1|

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

View File

@ -0,0 +1,32 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu name="Documentation">
<item name="Mappings" href="mappings.html" collapse="true">
<item name="Pubmed" href="pubmed.html"/>
<item name="Datacite" href="datacite.html"/>
</item>
<item name="Release Notes" href="release-notes.html" />
<item name="General Information" href="about.html"/>
<item name="JavaDoc" href="apidocs/" />
<item name="ScalaDoc" href="scaladocs/" />
</menu>
<menu ref="reports"/>
</body>
</project>

View File

@ -550,7 +550,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId> <artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version> <version>3.9.1</version>
</plugin> </plugin>
<plugin> <plugin>