diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java index af0d5169d..9287a8cdd 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java @@ -17,6 +17,9 @@ public class PMArticle implements Serializable { * the Pubmed Identifier */ private String pmid; + + private String pmcId; + /** * the DOI */ @@ -122,7 +125,7 @@ public class PMArticle implements Serializable { /** * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. - * Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. + * Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. * The NLM journal title abbreviation is exported in the element. * * @return the pubmed Journal Extracted @@ -140,10 +143,11 @@ public class PMArticle implements Serializable { } /** - * English-language abstracts are taken directly from the published article. - * If the article does not have a published abstract, the National Library of Medicine does not create one, - * thus the record lacks the and elements. However, in the absence of a formally - * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. + * contains the entire title of the journal article. is always in English; + * those titles originally published in a non-English language and translated for are enclosed in square brackets. + * All titles end with a period unless another punctuation mark such as a question mark or bracket is present. + * Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl). + * Corporate/collective authors may appear at the end of for citations up to about the year 2000. * * @return the extracted pubmed Title */ @@ -250,4 +254,14 @@ public class PMArticle implements Serializable { public List getGrants() { return grants; } + + + public String getPmcId() { + return pmcId; + } + + public PMArticle setPmcId(String pmcId) { + this.pmcId = pmcId; + return this; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala index 49a271641..9102c12c4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala @@ -98,6 +98,7 @@ class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] { case "PMID" => currentArticle.setPmid(text.trim) case "ArticleId" => if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim) + if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(text.trim) case "Language" => currentArticle.setLanguage(text.trim) case "ISSN" => currentJournal.setIssn(text.trim) case "GrantID" => currentGrant.setGrantID(text.trim) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index 92ad22c57..24a1fa62b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -4,9 +4,12 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf._ -import collection.JavaConverters._ +import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.commons.lang3.StringUtils +import collection.JavaConverters._ import java.util.regex.Pattern +import scala.collection.mutable.ListBuffer /** */ @@ -14,6 +17,9 @@ object PubMedToOaf { val SUBJ_CLASS = "keywords" + val OAI_HEADER = "oai:pubmedcentral.nih.gov:" + val OLD_PMC_PREFIX = "od_______267::" + val urlMap = Map( "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/", "doi" -> "https://dx.doi.org/" @@ -50,6 +56,17 @@ object PubMedToOaf { null } + + def createOriginalOpenaireId(article:PMArticle) :String = { + if (StringUtils.isNotEmpty(article.getPmcId)) { + val md5 = DHPUtils.md5(s"$OAI_HEADER${article.getPmcId.replace("PMC","")}") + s"$OLD_PMC_PREFIX$md5" + } + else + null + + } + /** Create an instance of class extends Result * starting from OAF instanceType value * @@ -122,8 +139,9 @@ object PubMedToOaf { return null // MAP PMID into pid with classid = classname = pmid - val pidList: List[StructuredProperty] = List( - OafMapperUtils.structuredProperty( + val pidList = ListBuffer[StructuredProperty]() + + pidList += OafMapperUtils.structuredProperty( article.getPmid, PidType.pmid.toString, PidType.pmid.toString, @@ -131,7 +149,19 @@ object PubMedToOaf { ModelConstants.DNET_PID_TYPES, dataInfo ) - ) + + + if (StringUtils.isNotBlank(article.getPmcId)) + { + pidList += OafMapperUtils.structuredProperty( + article.getPmcId, + PidType.pmc.toString, + PidType.pmc.toString, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + dataInfo + ) + } if (pidList == null) return null @@ -186,6 +216,7 @@ object PubMedToOaf { val urlLists: List[String] = pidList .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) .filter(t => t._1.nonEmpty) + .toList .map(t => t._1 + t._2) if (urlLists != null) pubmedInstance.setUrl(urlLists.asJava) @@ -262,7 +293,14 @@ object PubMedToOaf { if (authors != null && authors.nonEmpty) result.setAuthor(authors.asJava) - result.setOriginalId(pidList.map(s => s.getValue).asJava) + + if (StringUtils.isNotEmpty(article.getPmcId)) { + val originalIDS = ListBuffer[String]() + originalIDS += createOriginalOpenaireId(article) + pidList.map(s => s.getValue).foreach(p =>originalIDS += p) + result.setOriginalId(originalIDS.asJava) + } else + result.setOriginalId(pidList.map(s => s.getValue).asJava) result.setId(article.getPmid) diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml index 22da07e29..58a73ae5d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml @@ -195,7 +195,9 @@ Biochemical and biophysical research communications Biochem Biophys Res Commun - Delineation of the intimate details of the backbone conformation of pyridine nucleotide coenzymes in aqueous solution. + Delineation of the intimate details of the backbone conformation of pyridine nucleotide + coenzymes in aqueous solution. + 1173-9 @@ -473,7 +475,9 @@ Biochemical and biophysical research communications Biochem Biophys Res Commun - Effect of chloroquine on cultured fibroblasts: release of lysosomal hydrolases and inhibition of their uptake. + Effect of chloroquine on cultured fibroblasts: release of lysosomal hydrolases and + inhibition of their uptake. + 1338-43 @@ -657,7 +661,8 @@ Biochemical and biophysical research communications Biochem Biophys Res Commun - Atomic models for the polypeptide backbones of myohemerythrin and hemerythrin. + Atomic models for the polypeptide backbones of myohemerythrin and hemerythrin. + 1349-56 @@ -1627,7 +1632,9 @@ Biochemical pharmacology Biochem Pharmacol - Comparison between procaine and isocarboxazid metabolism in vitro by a liver microsomal amidase-esterase. + Comparison between procaine and isocarboxazid metabolism in vitro by a liver microsomal + amidase-esterase. + 1517-21 @@ -2030,7 +2037,9 @@ Biochemical pharmacology Biochem Pharmacol - Radiochemical assay of glutathione S-epoxide transferase and its enhancement by phenobarbital in rat liver in vivo. + Radiochemical assay of glutathione S-epoxide transferase and its enhancement by + phenobarbital in rat liver in vivo. + 1569-72 @@ -2350,7 +2359,9 @@ Biochemical pharmacology Biochem Pharmacol - Identification of adenylate cyclase-coupled beta-adrenergic receptors with radiolabeled beta-adrenergic antagonists. + Identification of adenylate cyclase-coupled beta-adrenergic receptors with radiolabeled + beta-adrenergic antagonists. + 1651-8 @@ -2598,7 +2609,9 @@ Biochemical pharmacology Biochem Pharmacol - The effect of adrenaline and of alpha- and beta-adrenergic blocking agents on ATP concentration and on incorporation of 32Pi into ATP in rat fat cells. + The effect of adrenaline and of alpha- and beta-adrenergic blocking agents on ATP + concentration and on incorporation of 32Pi into ATP in rat fat cells. + 1659-62 @@ -2851,7 +2864,9 @@ Biochemical pharmacology Biochem Pharmacol - Action of propranolol on mitochondrial functions--effects on energized ion fluxes in the presence of valinomycin. + Action of propranolol on mitochondrial functions--effects on energized ion fluxes in the + presence of valinomycin. + 1701-5 @@ -3265,7 +3280,8 @@ EC 2.6.1.16 - Glutamine-Fructose-6-Phosphate Transaminase (Isomerizing) + Glutamine-Fructose-6-Phosphate Transaminase (Isomerizing) + EC 2.7.- @@ -3324,7 +3340,9 @@ Glucosamine - Glutamine-Fructose-6-Phosphate Transaminase (Isomerizing) + Glutamine-Fructose-6-Phosphate Transaminase + (Isomerizing) + metabolism @@ -3463,7 +3481,8 @@ Biochemical pharmacology Biochem Pharmacol - Inhibition of aldehyde reductase by acidic metabolites of the biogenic amines. + Inhibition of aldehyde reductase by acidic metabolites of the biogenic amines. + 1731-3 @@ -3696,7 +3715,9 @@ Biochemical pharmacology Biochem Pharmacol - Effects of 5,6-dihydroxytryptamine on tyrosine-hydroxylase activity in central catecholaminergic neurons of the rat. + Effects of 5,6-dihydroxytryptamine on tyrosine-hydroxylase activity in central + catecholaminergic neurons of the rat. + 1739-42 @@ -4602,12 +4623,19 @@ Arzneimittel-Forschung Arzneimittelforschung - [Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol (author's transl)]. + [Biochemical studies on camomile components/III. In vitro studies about the antipeptic + activity of (--)-alpha-bisabolol (author's transl)]. + 1352-4 - (--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost. + (--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not + caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 + percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol + only occurs in case of direct contact. In case of a previous contact with the substrate, the + inhibiting effect is lost. + @@ -4626,7 +4654,9 @@ English Abstract Journal Article - Biochemische Untersuchungen von Kamilleninhaltsstoffen. III. In-vitro-Versuche über die antipeptische Wirkung des (-)-alpha-Bisabolols + Biochemische Untersuchungen von Kamilleninhaltsstoffen. III. In-vitro-Versuche über die + antipeptische Wirkung des (-)-alpha-Bisabolols + Germany @@ -4753,12 +4783,37 @@ Arzneimittel-Forschung Arzneimittelforschung - [Demonstration of tumor inhibiting properties of a strongly immunostimulating low-molecular weight substance. Comparative studies with ifosfamide on the immuno-labile DS carcinosarcoma. Stimulation of the autoimmune activity for approx. 20 days by BA 1, a N-(2-cyanoethylene)-urea. Novel prophylactic possibilities]. + [Demonstration of tumor inhibiting properties of a strongly immunostimulating + low-molecular weight substance. Comparative studies with ifosfamide on the immuno-labile DS + carcinosarcoma. Stimulation of the autoimmune activity for approx. 20 days by BA 1, a + N-(2-cyanoethylene)-urea. Novel prophylactic possibilities]. + 1369-79 - A report is given on the recent discovery of outstanding immunological properties in BA 1 [N-(2-cyanoethylene)-urea] having a (low) molecular mass M = 111.104. Experiments in 214 DS carcinosarcoma bearing Wistar rats have shown that BA 1, at a dosage of only about 12 percent LD50 (150 mg kg) and negligible lethality (1.7 percent), results in a recovery rate of 40 percent without hyperglycemia and, in one test, of 80 percent with hyperglycemia. Under otherwise unchanged conditions the reference substance ifosfamide (IF) -- a further development of cyclophosphamide -- applied without hyperglycemia in its most efficient dosage of 47 percent LD50 (150 mg kg) brought about a recovery rate of 25 percent at a lethality of 18 percent. (Contrary to BA 1, 250-min hyperglycemia caused no further improvement of the recovery rate.) However this comparison is characterized by the fact that both substances exhibit two quite different (complementary) mechanisms of action. Leucocyte counts made after application of the said cancerostatics and dosages have shown a pronounced stimulation with BA 1 and with ifosfamide, the known suppression in the post-therapeutic interval usually found with standard cancerostatics. In combination with the cited plaque test for BA 1, blood pictures then allow conclusions on the immunity status. Since IF can be taken as one of the most efficient cancerostatics--there is no other chemotherapeutic known up to now that has a more significant effect on the DS carcinosarcoma in rats -- these findings are of special importance. Finally, the total amount of leucocytes and lymphocytes as well as their time behaviour was determined from the blood picture of tumour-free rats after i.v. application of BA 1. The thus obtained numerical values clearly show that further research work on the prophylactic use of this substance seems to be necessary and very promising. + A report is given on the recent discovery of outstanding immunological properties in + BA 1 [N-(2-cyanoethylene)-urea] having a (low) molecular mass M = 111.104. Experiments in 214 DS + carcinosarcoma bearing Wistar rats have shown that BA 1, at a dosage of only about 12 percent + LD50 (150 mg kg) and negligible lethality (1.7 percent), results in a recovery rate of 40 + percent without hyperglycemia and, in one test, of 80 percent with hyperglycemia. Under + otherwise unchanged conditions the reference substance ifosfamide (IF) -- a further development + of cyclophosphamide -- applied without hyperglycemia in its most efficient dosage of 47 percent + LD50 (150 mg kg) brought about a recovery rate of 25 percent at a lethality of 18 percent. + (Contrary to BA 1, 250-min hyperglycemia caused no further improvement of the recovery rate.) + However this comparison is characterized by the fact that both substances exhibit two quite + different (complementary) mechanisms of action. Leucocyte counts made after application of the + said cancerostatics and dosages have shown a pronounced stimulation with BA 1 and with + ifosfamide, the known suppression in the post-therapeutic interval usually found with standard + cancerostatics. In combination with the cited plaque test for BA 1, blood pictures then allow + conclusions on the immunity status. Since IF can be taken as one of the most efficient + cancerostatics--there is no other chemotherapeutic known up to now that has a more significant + effect on the DS carcinosarcoma in rats -- these findings are of special importance. Finally, + the total amount of leucocytes and lymphocytes as well as their time behaviour was determined + from the blood picture of tumour-free rats after i.v. application of BA 1. The thus obtained + numerical values clearly show that further research work on the prophylactic use of this + substance seems to be necessary and very promising. + @@ -4778,7 +4833,11 @@ English Abstract Journal Article - Nachweis krebshemmender Eigenschaften einer stark immunstimulierenden Verbindung kleiner Molekülmasse. Versuche am immunlabilen DS-Karzinosarkom im Vergleich mit Ifosfamid. Stimulierung der körpereigenen Abwehr über etwa 20 Tage durch BA 1, einen N-(2-Cyanthylen)-harnstoff. Neue prophylaktische Möglichkeiten + Nachweis krebshemmender Eigenschaften einer stark immunstimulierenden Verbindung + kleiner Molekülmasse. Versuche am immunlabilen DS-Karzinosarkom im Vergleich mit Ifosfamid. + Stimulierung der körpereigenen Abwehr über etwa 20 Tage durch BA 1, einen + N-(2-Cyanthylen)-harnstoff. Neue prophylaktische Möglichkeiten + Germany @@ -5016,7 +5075,20 @@ 1400-3 - The distribution of blood flow to the subendocardial, medium and subepicardial layers of the left ventricular free wall was studied in anaesthetized dogs under normoxic (A), hypoxic (B) conditions and under pharmacologically induced (etafenone) coronary vasodilation (C). Regional myocardial blood flow was determined by means of the particle distribution method. In normoxia a transmural gradient of flow was observed, with the subendocardial layers receiving a significantly higher flow rate compared with the subepicardial layers. In hypoxia induced vasodilation this transmural gradient of flow was persistent. In contrast a marked redistribution of regional flow was observed under pharmacologically induced vasodilation. The transmural gradient decreased. In contrast to some findings these experiments demonstrate that a considerable vasodilatory capacity exists in all layers of the myocardium and can be utilized by drugs. The differences observed for the intramural distribution pattern of flow under hypoxia and drug induced vasodilation support the hypothesis that this pattern reflects corresponding gradients of regional myocardial metabolism. + The distribution of blood flow to the subendocardial, medium and subepicardial layers + of the left ventricular free wall was studied in anaesthetized dogs under normoxic (A), hypoxic + (B) conditions and under pharmacologically induced (etafenone) coronary vasodilation (C). + Regional myocardial blood flow was determined by means of the particle distribution method. In + normoxia a transmural gradient of flow was observed, with the subendocardial layers receiving a + significantly higher flow rate compared with the subepicardial layers. In hypoxia induced + vasodilation this transmural gradient of flow was persistent. In contrast a marked + redistribution of regional flow was observed under pharmacologically induced vasodilation. The + transmural gradient decreased. In contrast to some findings these experiments demonstrate that a + considerable vasodilatory capacity exists in all layers of the myocardium and can be utilized by + drugs. The differences observed for the intramural distribution pattern of flow under hypoxia + and drug induced vasodilation support the hypothesis that this pattern reflects corresponding + gradients of regional myocardial metabolism. + @@ -5185,4 +5257,151 @@ + + + 4917185 + + 1970 + 10 + 27 + + + 2018 + 11 + 13 + +
+ + 0003-6919 + + 19 + 6 + + 1970 + Jun + + + Applied microbiology + Appl Microbiol + + Bactericidal activity of a broad-spectrum illumination source. + + 1013-4 + + + + Several hours of exposure to Vita-Lite lamps, which have a unique spectral + distribution, give significant killing of cells of Staphylococcus aureus. + + + + + Himmelfarb + P + P + + + Scott + A + A + + + Thayer + P S + PS + + + eng + + Journal Article + +
+ + United States + Appl Microbiol + 7605802 + 0003-6919 + + IM + + + Bacteriological Techniques + instrumentation + + + Light + + + Radiation Effects + + + Serratia marcescens + growth & development + radiation effects + + + Staphylococcus + growth & development + radiation effects + + + Sterilization + + +
+ + + + 1970 + 6 + 1 + + + 1970 + 6 + 1 + 0 + 1 + + + 1970 + 6 + 1 + 0 + 0 + + + ppublish + + 4917185 + PMC376844 + + + + Photochem Photobiol. 1969 Jan;9(1):99-102 + + 4889809 + + + + Endocrinology. 1969 Dec;85(6):1218-21 + + 5347623 + + + + Arch Mikrobiol. 1956;24(1):60-79 + + 13327987 + + + + J Bacteriol. 1941 Sep;42(3):353-66 + + 16560457 + + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index ea742a04a..b021e5e07 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -2,9 +2,10 @@ package eu.dnetlib.dhp.sx.bio import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest -import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} +import eu.dnetlib.dhp.schema.oaf.utils.PidType +import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result} import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved -import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf} +import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf} import org.json4s.DefaultFormats import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.parse @@ -16,6 +17,7 @@ import org.mockito.junit.jupiter.MockitoExtension import java.io.{BufferedReader, InputStream, InputStreamReader} import java.util.zip.GZIPInputStream import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer import scala.io.Source import scala.xml.pull.XMLEventReader @@ -72,6 +74,102 @@ class BioScholixTest extends AbstractVocabularyTest { ) println(mapper.writeValueAsString(r.head)) + } + + + private def checkPMArticle(article:PMArticle): Unit = { + assertNotNull(article.getPmid) + assertNotNull(article.getTitle) + assertNotNull(article.getAuthors) + article.getAuthors.asScala.foreach{a => + assertNotNull(a) + assertNotNull(a.getFullName) + } + + } + + @Test + def testParsingPubmedXML():Unit = { + val xml = new XMLEventReader(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))) + val parser = new PMParser(xml) + parser.foreach(checkPMArticle) + } + + + private def checkPubmedPublication(o:Oaf): Unit = { + assertTrue(o.isInstanceOf[Publication]) + val p:Publication = o.asInstanceOf[Publication] + assertNotNull(p.getId) + assertNotNull(p.getTitle) + p.getTitle.asScala.foreach(t =>assertNotNull(t.getValue)) + p.getAuthor.asScala.foreach(a =>assertNotNull(a.getFullname)) + assertNotNull(p.getInstance()) + p.getInstance().asScala.foreach { i => + assertNotNull(i.getCollectedfrom) + assertNotNull(i.getPid) + assertNotNull(i.getInstancetype) + } + assertNotNull(p.getOriginalId) + p.getOriginalId.asScala.foreach(oId => assertNotNull(oId)) + + + val hasPMC = p.getInstance().asScala.exists(i => i.getPid.asScala.exists(pid => pid.getQualifier.getClassid.equalsIgnoreCase(PidType.pmc.toString))) + + + + if (hasPMC) { + assertTrue(p.getOriginalId.asScala.exists(oId => oId.startsWith("od_______267::"))) + } + } + + + @Test + def testPubmedOriginalID():Unit = { + val article:PMArticle = new PMArticle + + + article.setPmid("1234") + + article.setTitle("a Title") + + // VERIFY PUBLICATION IS NOT NULL + article.getPublicationTypes.add( new PMSubject("article",null, null)) + var publication = PubMedToOaf.convert(article, vocabularies).asInstanceOf[Publication] + assertNotNull(publication) + assertEquals("50|pmid________::81dc9bdb52d04dc20036dbd8313ed055", publication.getId) + + // VERIFY PUBLICATION ID DOES NOT CHANGE ALSO IF SETTING PMC IDENTIFIER + article.setPmcId("PMC1517292") + publication = PubMedToOaf.convert(article, vocabularies).asInstanceOf[Publication] + assertNotNull(publication) + assertEquals("50|pmid________::81dc9bdb52d04dc20036dbd8313ed055", publication.getId) + + // VERIFY ORIGINAL ID GENERATE IN OLD WAY USING PMC IDENTIFIER EXISTS + + + val oldOpenaireID ="od_______267::0000072375bc0e68fa09d4e6b7658248" + + val hasOldOpenAIREID = publication.getOriginalId.asScala.exists(o => o.equalsIgnoreCase(oldOpenaireID)) + + assertTrue(hasOldOpenAIREID) + } + + + @Test + def testPubmedMapping() :Unit = { + + val xml = new XMLEventReader(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))) + val parser = new PMParser(xml) + val results = ListBuffer[Oaf]() + parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies)) + + + + + results.foreach(checkPubmedPublication) + + + } @Test diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index b9de5dd11..730e8a3fe 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -28,28 +28,6 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; public class SparkEoscTag { private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final Qualifier EOSC_QUALIFIER = OafMapperUtils - .qualifier( - "EOSC", - "European Open Science Cloud", - ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES); - public static final DataInfo EOSC_DATAINFO = OafMapperUtils - .dataInfo( - false, "propagation", true, false, - OafMapperUtils - .qualifier( - "propagation:subject", "Inferred by OpenAIRE", - ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), - "0.9"); - public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils - .structuredProperty( - "EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO); - public final static StructuredProperty EOSC_GALAXY = OafMapperUtils - .structuredProperty( - "EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO); - public final static StructuredProperty EOSC_TWITTER = OafMapperUtils - .structuredProperty( - "EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO); public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils @@ -84,29 +62,35 @@ public class SparkEoscTag { }); } + public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics) { + EoscIfGuidelines eig = new EoscIfGuidelines(); + eig.setCode(code); + eig.setLabel(label); + eig.setUrl(url); + eig.setSemanticRelation(semantics); + return eig; + + } + private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) { readPath(spark, inputPath + "/software", Software.class) .map((MapFunction) s -> { - List sbject; - if (!Optional.ofNullable(s.getSubject()).isPresent()) - s.setSubject(new ArrayList<>()); - sbject = s.getSubject(); if (containsCriteriaNotebook(s)) { - sbject.add(EOSC_NOTEBOOK); - if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))) { - sbject = sbject.stream().map(sb -> { - if (sb.getValue().equals("EOSC Jupyter Notebook")) { - return null; - } - return sb; - }).filter(Objects::nonNull).collect(Collectors.toList()); - s.setSubject(sbject); - } + if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) + s.setEoscifguidelines(new ArrayList<>()); + addEIG( + s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", + "compliesWith"); + } if (containsCriteriaGalaxy(s)) { - sbject.add(EOSC_GALAXY); + if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) + s.setEoscifguidelines(new ArrayList<>()); + + addEIG( + s.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); } return s; }, Encoders.bean(Software.class)) @@ -123,15 +107,17 @@ public class SparkEoscTag { readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class) .map((MapFunction) orp -> { - List sbject; - if (!Optional.ofNullable(orp.getSubject()).isPresent()) - orp.setSubject(new ArrayList<>()); - sbject = orp.getSubject(); + + if (!Optional.ofNullable(orp.getEoscifguidelines()).isPresent()) + orp.setEoscifguidelines(new ArrayList<>()); + if (containsCriteriaGalaxy(orp)) { - sbject.add(EOSC_GALAXY); + addEIG( + orp.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", + "compliesWith"); } if (containscriteriaTwitter(orp)) { - sbject.add(EOSC_TWITTER); + addEIG(orp.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return orp; }, Encoders.bean(OtherResearchProduct.class)) @@ -148,12 +134,11 @@ public class SparkEoscTag { readPath(spark, inputPath + "/dataset", Dataset.class) .map((MapFunction) d -> { - List sbject; - if (!Optional.ofNullable(d.getSubject()).isPresent()) - d.setSubject(new ArrayList<>()); - sbject = d.getSubject(); + + if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) + d.setEoscifguidelines(new ArrayList<>()); if (containscriteriaTwitter(d)) { - sbject.add(EOSC_TWITTER); + addEIG(d.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return d; }, Encoders.bean(Dataset.class)) @@ -169,6 +154,12 @@ public class SparkEoscTag { .json(inputPath + "/dataset"); } + private static void addEIG(List eoscifguidelines, String code, String label, String url, + String sem) { + if (!eoscifguidelines.stream().anyMatch(eig -> eig.getCode().equals(code))) + eoscifguidelines.add(newInstance(code, label, url, sem)); + } + private static boolean containscriteriaTwitter(Result r) { Set words = getWordsSP(r.getTitle()); words.addAll(getWordsF(r.getDescription())); @@ -212,13 +203,6 @@ public class SparkEoscTag { return false; } - private static Set getSubjects(List s) { - Set subjects = new HashSet<>(); - s.stream().forEach(sbj -> subjects.addAll(Arrays.asList(sbj.getValue().toLowerCase().split(" ")))); - s.stream().forEach(sbj -> subjects.add(sbj.getValue().toLowerCase())); - return subjects; - } - private static Set getWordsSP(List elem) { Set words = new HashSet<>(); Optional @@ -242,9 +226,7 @@ public class SparkEoscTag { t -> words .addAll( Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))))); -// elem -// .forEach( -// t -> words.addAll(Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" ")))); + return words; } diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index 1ea254157..5f47da10e 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -126,10 +126,23 @@ public class EOSCTagJobTest { .assertEquals( 4, tmp + .filter(s -> s.getEoscifguidelines() != null) .filter( - s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) .count()); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions .assertEquals( 1, tmp @@ -140,6 +153,16 @@ public class EOSCTagJobTest { .size()); Assertions .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); + + Assertions + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() @@ -166,16 +189,24 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) + .collect() + .get(0) + .getEoscifguidelines() == null); + Assertions .assertEquals( - 9, tmp + 8, tmp .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .collect() @@ -183,6 +214,23 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -201,17 +249,24 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) + .collect() + .get(0) + .getEoscifguidelines() == null); Assertions .assertEquals( - 9, tmp + 8, tmp .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .collect() @@ -219,14 +274,31 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions + .assertEquals( + 1, + tmp + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); List subjects = tmp .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) .collect() .get(0) .getSubject(); - Assertions.assertEquals(8, subjects.size()); - Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions.assertEquals(7, subjects.size()); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("Modeling and Simulation"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("structure granulaire"))); @@ -250,6 +322,17 @@ public class EOSCTagJobTest { .filter( ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); + Assertions + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) + .filter( + ds -> ds + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) + .count()); Assertions .assertEquals( @@ -264,7 +347,22 @@ public class EOSCTagJobTest { .textFile(workingDir.toString() + "/input/otherresearchproduct") .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) .filter( - ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + orp -> orp + .getSubject() + .stream() + .anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + .count()); + + Assertions + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/otherresearchproduct") + .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) + .filter( + orp -> orp + .getSubject() + .stream() + .anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook"))) .count()); // spark.stop(); @@ -326,22 +424,41 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 1, + 0, tmp .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines() != null) + .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines() != null) + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); Assertions .assertEquals( - 2, tmp + 1, tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() @@ -350,6 +467,24 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); + Assertions .assertEquals( 5, tmp @@ -385,22 +520,34 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 1, + 0, orp .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); + orp.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); Assertions .assertEquals( - 3, orp + 1, orp + .filter(o -> o.getEoscifguidelines() != null) + .filter( + o -> o + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); + + Assertions + .assertEquals( + 2, orp .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( orp .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .collect() @@ -408,6 +555,23 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); + Assertions + .assertEquals( + 1, orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -516,10 +680,20 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 3, + 0, orp .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) .count()); + Assertions + .assertEquals( + 3, + orp + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) + .count()); JavaRDD dats = sc .textFile(workingDir.toString() + "/input/dataset") @@ -531,7 +705,11 @@ public class EOSCTagJobTest { .assertEquals( 3, dats - .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) .count()); } diff --git a/pom.xml b/pom.xml index 821ce3124..973bc3773 100644 --- a/pom.xml +++ b/pom.xml @@ -801,7 +801,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.12.1] + [2.12.2-SNAPSHOT] [4.0.3] [6.0.5] [3.1.6]