diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 98e22d8a35..c89cc9d1dc 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -29,6 +29,13 @@ testCompile + + scala-doc + process-resources + + doc + + ${scala.version} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java index 8815284255..af0d5169d3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java @@ -5,94 +5,249 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.List; +/** + * This class represent an instance of Pubmed Article extracted from the native XML + * + * @author Sandro La Bruzzo + */ + public class PMArticle implements Serializable { + /** + * the Pubmed Identifier + */ private String pmid; + /** + * the DOI + */ private String doi; + /** + * the Pubmed Date extracted from Specifies a date significant to either the article's history or the citation's processing. + * All dates will have a , , and elements. Some may have an , , and element(s). + */ private String date; + /** + * This is an 'envelop' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself. + */ private PMJournal journal; + /** + * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the element. 
+ */ private String title; + /** + * English-language abstracts are taken directly from the published article. + * If the article does not have a published abstract, the National Library of Medicine does not create one, + * thus the record lacks the and elements. However, in the absence of a formally + * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. + */ private String description; + /** + * the language in which an article was published is recorded in . + * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single + * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value. + * Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined. + */ private String language; + + /** + * NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations. * + */ private final List subjects = new ArrayList<>(); + /** + * This element is used to identify the type of article indexed for MEDLINE; + * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of + * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). + */ private final List publicationTypes = new ArrayList<>(); + /** + * Personal and collective (corporate) author names published with the article are found in . 
+ */ private List authors = new ArrayList<>(); - public List getPublicationTypes() { - return publicationTypes; - } - + /** + * contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service + * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations. + */ private final List grants = new ArrayList<>(); - public List getGrants() { - return grants; - } - + /** + * get the DOI + * @return a DOI + */ public String getDoi() { return doi; } + /** + * Set the DOI + * @param doi a DOI + */ public void setDoi(String doi) { this.doi = doi; } + /** + * get the Pubmed Identifier + * @return the PMID + */ public String getPmid() { return pmid; } + /** + * set the Pubmed Identifier + * @param pmid the Pubmed Identifier + */ public void setPmid(String pmid) { this.pmid = pmid; } + /** + * the Pubmed Date extracted from Specifies a date significant to either the article's history or the citation's processing. + * All dates will have a , , and elements. Some may have an , , and element(s). + * + * @return the Pubmed Date + */ public String getDate() { return date; } + /** + * Set the pubmed Date + * @param date + */ public void setDate(String date) { this.date = date; } + /** + * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. + * Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. + * The NLM journal title abbreviation is exported in the element. 
+ * + * @return the pubmed Journal Extracted + */ public PMJournal getJournal() { return journal; } + /** + * Set the mapped pubmed Journal + * @param journal + */ public void setJournal(PMJournal journal) { this.journal = journal; } + /** + * English-language abstracts are taken directly from the published article. + * If the article does not have a published abstract, the National Library of Medicine does not create one, + * thus the record lacks the and elements. However, in the absence of a formally + * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. + * + * @return the extracted pubmed Title + */ public String getTitle() { return title; } + /** + * set the pubmed title + * @param title + */ public void setTitle(String title) { this.title = title; } + /** + * English-language abstracts are taken directly from the published article. + * If the article does not have a published abstract, the National Library of Medicine does not create one, + * thus the record lacks the and elements. However, in the absence of a formally + * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. + * + * @return the Mapped Pubmed Article Abstracts + */ public String getDescription() { return description; } + /** + * Set the Mapped Pubmed Article Abstracts + * @param description + */ public void setDescription(String description) { this.description = description; } + /** + * Personal and collective (corporate) author names published with the article are found in . 
+ * + * @return get the Mapped Authors lists + */ public List getAuthors() { return authors; } + /** + * Set the Mapped Authors lists + * @param authors + */ public void setAuthors(List authors) { this.authors = authors; } + /** + * This element is used to identify the type of article indexed for MEDLINE; + * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of + * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). + * + * @return the mapped Subjects + */ public List getSubjects() { return subjects; } + /** + * + * the language in which an article was published is recorded in . + * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single + * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value. + * Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined. + * + * @return The mapped Language + */ public String getLanguage() { return language; } + /** + * + * Set The mapped Language + * + * @param language the mapped Language + */ public void setLanguage(String language) { this.language = language; } + + /** + * This element is used to identify the type of article indexed for MEDLINE; + * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of + * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). 
+ * + * @return the mapped Publication Type + */ + public List getPublicationTypes() { + return publicationTypes; + } + + /** + * contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service + * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations. + * @return the mapped grants + */ + + public List getGrants() { + return grants; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java index cef92d0031..68ef6459e0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java @@ -3,27 +3,57 @@ package eu.dnetlib.dhp.sx.bio.pubmed; import java.io.Serializable; +/** + * The type Pubmed author. + * + * @author Sandro La Bruzzo + */ public class PMAuthor implements Serializable { private String lastName; private String foreName; + /** + * Gets last name. + * + * @return the last name + */ public String getLastName() { return lastName; } + /** + * Sets last name. + * + * @param lastName the last name + */ public void setLastName(String lastName) { this.lastName = lastName; } + /** + * Gets fore name. + * + * @return the fore name + */ public String getForeName() { return foreName; } + /** + * Sets fore name. + * + * @param foreName the fore name + */ public void setForeName(String foreName) { this.foreName = foreName; } + /** + * Gets full name. + * + * @return the full name + */ public String getFullName() { return String .format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? 
this.lastName : ""); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java index ce9420cc13..abb9084834 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java @@ -1,41 +1,86 @@ package eu.dnetlib.dhp.sx.bio.pubmed; +/** + * The type Pm grant. + * + * @author Sandro La Bruzzo + */ public class PMGrant { private String grantID; private String agency; private String country; + /** + * Instantiates a new Pm grant. + */ public PMGrant() { } + /** + * Instantiates a new Pm grant. + * + * @param grantID the grant id + * @param agency the agency + * @param country the country + */ public PMGrant(String grantID, String agency, String country) { this.grantID = grantID; this.agency = agency; this.country = country; } + /** + * Gets grant id. + * + * @return the grant id + */ public String getGrantID() { return grantID; } + /** + * Sets grant id. + * + * @param grantID the grant id + */ public void setGrantID(String grantID) { this.grantID = grantID; } + /** + * Gets agency. + * + * @return the agency + */ public String getAgency() { return agency; } + /** + * Sets agency. + * + * @param agency the agency + */ public void setAgency(String agency) { this.agency = agency; } + /** + * Gets country. + * + * @return the country + */ public String getCountry() { return country; } + /** + * Sets country. 
+ * + * @param country the country + */ public void setCountry(String country) { this.country = country; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java index 863a23bd50..731648839e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java @@ -3,6 +3,11 @@ package eu.dnetlib.dhp.sx.bio.pubmed; import java.io.Serializable; +/** + * The type Pm journal. + * + * @author Sandro La Bruzzo + */ public class PMJournal implements Serializable { private String issn; @@ -11,42 +16,92 @@ public class PMJournal implements Serializable { private String date; private String title; + /** + * Gets issn. + * + * @return the issn + */ public String getIssn() { return issn; } + /** + * Sets issn. + * + * @param issn the issn + */ public void setIssn(String issn) { this.issn = issn; } + /** + * Gets volume. + * + * @return the volume + */ public String getVolume() { return volume; } + /** + * Sets volume. + * + * @param volume the volume + */ public void setVolume(String volume) { this.volume = volume; } + /** + * Gets issue. + * + * @return the issue + */ public String getIssue() { return issue; } + /** + * Sets issue. + * + * @param issue the issue + */ public void setIssue(String issue) { this.issue = issue; } + /** + * Gets date. + * + * @return the date + */ public String getDate() { return date; } + /** + * Sets date. + * + * @param date the date + */ public void setDate(String date) { this.date = date; } + /** + * Gets title. + * + * @return the title + */ public String getTitle() { return title; } + /** + * Sets title. 
+ * + * @param title the title + */ public void setTitle(String title) { this.title = title; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala index 80cb0667cb..c6d5fdf74c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala @@ -2,6 +2,12 @@ package eu.dnetlib.dhp.sx.bio.pubmed import scala.xml.MetaData import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} + + +/** + * + * @param xml + */ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] { var currentArticle:PMArticle = generateNextArticle() diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java index 862d39a940..e3829bb7be 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java @@ -1,40 +1,83 @@ package eu.dnetlib.dhp.sx.bio.pubmed; +/** + * The type Pubmed subject. + */ public class PMSubject { private String value; private String meshId; private String registryNumber; + /** + * Instantiates a new Pm subject. + */ public PMSubject() { } + /** + * Instantiates a new Pm subject. + * + * @param value the value + * @param meshId the mesh id + * @param registryNumber the registry number + */ public PMSubject(String value, String meshId, String registryNumber) { this.value = value; this.meshId = meshId; this.registryNumber = registryNumber; } + /** + * Gets value. + * + * @return the value + */ public String getValue() { return value; } + /** + * Sets value. 
+ * + * @param value the value + */ public void setValue(String value) { this.value = value; } + /** + * Gets mesh id. + * + * @return the mesh id + */ public String getMeshId() { return meshId; } + /** + * Sets mesh id. + * + * @param meshId the mesh id + */ public void setMeshId(String meshId) { this.meshId = meshId; } + /** + * Gets registry number. + * + * @return the registry number + */ public String getRegistryNumber() { return registryNumber; } + /** + * Sets registry number. + * + * @param registryNumber the registry number + */ public void setRegistryNumber(String registryNumber) { this.registryNumber = registryNumber; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index 13f38408ed..ecef322020 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -8,6 +8,9 @@ import scala.collection.JavaConverters._ import java.util.regex.Pattern +/** + * + */ object PubMedToOaf { val SUBJ_CLASS = "keywords" @@ -15,7 +18,17 @@ object PubMedToOaf { "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/", "doi" -> "https://dx.doi.org/" ) + val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9") + val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") + + + /** + * Cleaning the DOI Applying regex in order to + * remove doi starting with URL + * @param doi input DOI + * @return cleaned DOI + */ def cleanDoi(doi: String): String = { val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$" @@ -30,6 +43,15 @@ object PubMedToOaf { null } + /** + * + * Create an instance of class extends Result + * starting from OAF instanceType value + * + * @param 
cobjQualifier OAF instance type + * @param vocabularies All dnet vocabularies + * @return the correct instance + */ def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid) result_typologies.getClassid match { @@ -42,6 +64,12 @@ object PubMedToOaf { } } + /** + * Mapping the Pubmedjournal info into the OAF Journale + * + * @param j the pubmedJournal + * @return the OAF Journal + */ def mapJournal(j: PMJournal): Journal = { if (j == null) return null @@ -49,6 +77,7 @@ object PubMedToOaf { journal.setDataInfo(dataInfo) journal.setName(j.getTitle) + journal.setConferencedate(j.getDate) journal.setVol(j.getVolume) journal.setIssnPrinted(j.getIssn) journal.setIss(j.getIssue) @@ -57,25 +86,43 @@ object PubMedToOaf { } - + /** + * + * Find vocabulary term into synonyms and term in the vocabulary + * + * @param vocabularyName the input vocabulary name + * @param vocabularies all the vocabularies + * @param term the term to find + * + * @return the cleaned term value + */ def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = { val a = vocabularies.getSynonymAsQualifier(vocabularyName, term) val b = vocabularies.getTermAsQualifier(vocabularyName, term) if (a == null) b else a } - val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9") - val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") + /** + * Map the Pubmed Article into the OAF instance + * + * + * @param article the pubmed articles + * @param vocabularies the vocabularies + * @return The OAF instance if the mapping did not fail + */ def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = { if (article.getPublicationTypes == null) return null - val i = new Instance + 
+ + // MAP PMID into pid with classid = classname = pmid val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)) if (pidList == null) return null + // MAP //ArticleId[./@IdType="doi"] into alternateIdentifier with classid = classname = doi var alternateIdentifier: StructuredProperty = null if (article.getDoi != null) { val normalizedPid = cleanDoi(article.getDoi) @@ -83,43 +130,64 @@ object PubMedToOaf { alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo) } + // INSTANCE MAPPING + //-------------------------------------------------------------------------------------- + // If the article contains the typology Journal Article then we apply this type //else We have to find a terms that match the vocabulary otherwise we discard it val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue)) + val pubmedInstance = new Instance if (ja.isDefined) { val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue) - i.setInstancetype(cojbCategory) + pubmedInstance.setInstancetype(cojbCategory) } else { val i_type = article.getPublicationTypes.asScala .map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue)) .find(q => q != null) if (i_type.isDefined) - i.setInstancetype(i_type.get) + pubmedInstance.setInstancetype(i_type.get) else return null } - val result = createResult(i.getInstancetype, vocabularies) + val result = createResult(pubmedInstance.getInstancetype, vocabularies) if (result == null) return result result.setDataInfo(dataInfo) - i.setPid(pidList.asJava) + pubmedInstance.setPid(pidList.asJava) if (alternateIdentifier != null) - 
i.setAlternateIdentifier(List(alternateIdentifier).asJava) - result.setInstance(List(i).asJava) - i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut) + pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava) + result.setInstance(List(pubmedInstance).asJava) + pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut) + //CREATE URL From pmid val urlLists: List[String] = pidList .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) .filter(t => t._1.nonEmpty) .map(t => t._1 + t._2) if (urlLists != null) - i.setUrl(urlLists.asJava) - i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) - i.setCollectedfrom(collectedFrom) + pubmedInstance.setUrl(urlLists.asJava) + + //ASSIGN DateofAcceptance + pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) + //ASSIGN COLLECTEDFROM + pubmedInstance.setCollectedfrom(collectedFrom) result.setPid(pidList.asJava) + + //END INSTANCE MAPPING + //-------------------------------------------------------------------------------------- + + + // JOURNAL MAPPING + //-------------------------------------------------------------------------------------- if (article.getJournal != null && result.isInstanceOf[Publication]) result.asInstanceOf[Publication].setJournal(mapJournal(article.getJournal)) result.setCollectedfrom(List(collectedFrom).asJava) + //END JOURNAL MAPPING + //-------------------------------------------------------------------------------------- + + + // RESULT MAPPING + //-------------------------------------------------------------------------------------- result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) if (article.getTitle == null || article.getTitle.isEmpty) @@ -159,6 
+227,9 @@ object PubMedToOaf { result.setId(article.getPmid) + + // END RESULT MAPPING + //-------------------------------------------------------------------------------------- val id = IdentifierFactory.createIdentifier(result) if (article.getPmid.equalsIgnoreCase(id)) return null diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/datacite.md b/dhp-workflows/dhp-aggregation/src/site/markdown/datacite.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/index.md b/dhp-workflows/dhp-aggregation/src/site/markdown/index.md new file mode 100644 index 0000000000..c0c7560826 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/index.md @@ -0,0 +1,9 @@ +## DHP-Aggregation + +This module defines a set of Oozie workflows for the **collection** and **transformation** of metadata records. + +Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure +the consistency of the read/write operations on the data, as the MdSM keeps track of the logical-physical mapping +of each MDStore. + +It defines [mappings](mappings.md) for the transformation of records from different data sources (see the mappings section). \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md b/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md new file mode 100644 index 0000000000..9da46a27e0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md @@ -0,0 +1,7 @@ +## DHP-Aggregation + +This module defines a set of Oozie workflows for the **collection** and **transformation** of metadata records. + +Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure +the consistency of the read/write operations on the data, as the MdSM keeps track of the logical-physical mapping +of each MDStore. 
\ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md b/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md new file mode 100644 index 0000000000..576c4b6bee --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md @@ -0,0 +1,18 @@ +DHP Aggregation +=============== + +DHP-Aggregation contains different mappings from the original data formats into the OAF data format, +which converge in the graph in different ways: + +- Via the Action Manager +- Directly in the MdStore on Hadoop + +Below is the list of the implemented mappings. + + +Mappings +======= + +1. [PubMed](pubmed.md) +2. [Datacite](datacite.md) + diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md new file mode 100644 index 0000000000..f6327a51b1 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md @@ -0,0 +1,62 @@ +# PubMed Mapping +This section describes the mapping implemented for [MEDLINE/PubMed](https://pubmed.ncbi.nlm.nih.gov/). 
+ +Collection +--------- +The native data is collected from the [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/), which contains XML with +the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html) + + +Parsing +------- +The class responsible for parsing is [PMParser](./scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser), which generates +an intermediate mapping of the PubMed Article defined [here](/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html) + + +Mapping +------- + +The table below describes the mapping from the native XML to the OAF model + + + + + +| Xpath Source | Oaf Field | Notes | +| ----------- | ----------- | ----------- | +| //PMID | pid | classid = classname = pmid +| | **Instance Mapping** | | +|//PublicationType | InstanceType | If the article contains the typology **Journal Article** then we apply this type; otherwise we look for a term that matches the vocabulary, and if none is found we discard the record +|//PMID | instance/PID | Map the pmid also in the pid in the instance | +| //ArticleId[./@IdType="doi"] | instance/alternateIdentifier |classid = classname = doi +|//PMID | instance/URL | prepend the base URL https://pubmed.ncbi.nlm.nih.gov/ to the PMID +| //PubmedPubDate | instance/Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assigning it +| FOR ALL INSTANCE | CollectedFrom | datasourceName: *Europe PubMed Central* DatasourceId: +| | **Journal Mapping** | | +|//Journal/PubDate| Journal/Conferencedate | map the date of the Journal +|//Journal/Title| Journal/Name | | +|//Journal/Volume| Journal/Vol | | +|//Journal/ISSN| Journal/issnPrinted | | +|//Journal/Issue| Journal/Iss | | +| | **Publication Mapping** | | +| //PubmedPubDate | Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assigning it +| //Title | title | with qualifier ModelConstants.MAIN_TITLE_QUALIFIER +| //AbstractText | Description || +|//Language| Language| cleaning vocabulary -> dnet:languages +|//DescriptorName| 
Subject | classId, className = keyword +| | **Author Mapping** | | +|//Author/LastName| author.Surname| | +|//Author/ForeName| author.Forename| | +|//Author/FullName| author.Forename| Concatenation of foreName + lastName if they exist | +|FOR ALL AUTHOR | author.rank| sequential number starting from 1| + + + + + + + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png b/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png new file mode 100644 index 0000000000..00d320c39b Binary files /dev/null and b/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png differ diff --git a/dhp-workflows/dhp-aggregation/src/site/site.xml b/dhp-workflows/dhp-aggregation/src/site/site.xml new file mode 100644 index 0000000000..da5da0f1e7 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/site.xml @@ -0,0 +1,32 @@ + + + + org.apache.maven.skins + maven-fluido-skin + 1.8 + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/pom.xml b/pom.xml index 02bc5d8d4e..d8773642ee 100644 --- a/pom.xml +++ b/pom.xml @@ -550,7 +550,7 @@ org.apache.maven.plugins maven-site-plugin - 3.7.1 + 3.9.1