1
0
Fork 0

Merge commit 'efd96e7e664e4139321e35e8d172b884ba4b61a1' into beta2master_sept_2022

This commit is contained in:
Claudio Atzori 2022-09-16 15:38:56 +02:00
commit cbd48bc645
39 changed files with 2861 additions and 641 deletions

View File

@ -419,4 +419,62 @@ public class OafMapperUtils {
m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo))); m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo)));
return m; return m;
} }
/**
 * Creates a {@link Relation} whose provenance is taken from the given entity and
 * that carries no validation date.
 *
 * @param source the identifier set as source of the relation
 * @param target the identifier set as target of the relation
 * @param relType the relation type
 * @param subRelType the relation sub-type
 * @param relClass the relation class
 * @param entity the entity supplying collectedfrom, dataInfo and lastupdatetimestamp
 * @return the relation built by the overload that also accepts a validation date
 */
public static Relation getRelation(
    final String source,
    final String target,
    final String relType,
    final String subRelType,
    final String relClass,
    final OafEntity entity) {
    // This overload has no validation date to offer.
    final String validationDate = null;
    return getRelation(source, target, relType, subRelType, relClass, entity, validationDate);
}
/**
 * Creates a {@link Relation} whose provenance is read from the given entity,
 * optionally marked with a validation date. No properties are attached.
 *
 * @param source the identifier set as source of the relation
 * @param target the identifier set as target of the relation
 * @param relType the relation type
 * @param subRelType the relation sub-type
 * @param relClass the relation class
 * @param entity the entity supplying collectedfrom, dataInfo and lastupdatetimestamp
 * @param validationDate the validation date, may be {@code null}
 * @return the relation built by the fully-parameterised overload
 */
public static Relation getRelation(
    final String source,
    final String target,
    final String relType,
    final String subRelType,
    final String relClass,
    final OafEntity entity,
    final String validationDate) {
    // Read the provenance from the entity in the same order the arguments are passed on.
    final List<KeyValue> entityCollectedFrom = entity.getCollectedfrom();
    final DataInfo entityDataInfo = entity.getDataInfo();
    final Long entityLastUpdate = entity.getLastupdatetimestamp();
    return getRelation(
        source, target, relType, subRelType, relClass,
        entityCollectedFrom, entityDataInfo, entityLastUpdate,
        validationDate, null);
}
/**
 * Creates a {@link Relation} from explicitly supplied provenance, with neither a
 * validation date nor properties.
 *
 * @param source the identifier set as source of the relation
 * @param target the identifier set as target of the relation
 * @param relType the relation type
 * @param subRelType the relation sub-type
 * @param relClass the relation class
 * @param collectedfrom the collectedfrom values to set on the relation
 * @param dataInfo the dataInfo to set on the relation
 * @param lastupdatetimestamp the last update timestamp to set on the relation
 * @return the relation built by the fully-parameterised overload
 */
public static Relation getRelation(
    final String source,
    final String target,
    final String relType,
    final String subRelType,
    final String relClass,
    final List<KeyValue> collectedfrom,
    final DataInfo dataInfo,
    final Long lastupdatetimestamp) {
    return getRelation(
        source, target, relType, subRelType, relClass,
        collectedfrom, dataInfo, lastupdatetimestamp,
        null, // validationDate
        null); // properties
}
/**
 * Creates a new {@link Relation} and populates it with the supplied values.
 * <p>
 * The relation is flagged as validated only when {@code validationDate} is not blank;
 * a blank or {@code null} validation date is stored as {@code null}.
 *
 * @param source the identifier set as source of the relation
 * @param target the identifier set as target of the relation
 * @param relType the relation type
 * @param subRelType the relation sub-type
 * @param relClass the relation class
 * @param collectedfrom the collectedfrom values to set on the relation
 * @param dataInfo the dataInfo to set on the relation
 * @param lastupdatetimestamp the last update timestamp to set on the relation
 * @param validationDate the validation date, may be blank or {@code null}
 * @param properties the properties to set on the relation, passed through as given
 * @return the populated relation
 */
public static Relation getRelation(
    final String source,
    final String target,
    final String relType,
    final String subRelType,
    final String relClass,
    final List<KeyValue> collectedfrom,
    final DataInfo dataInfo,
    final Long lastupdatetimestamp,
    final String validationDate,
    final List<KeyValue> properties) {
    // A single blank check drives both the validated flag and the stored date.
    final boolean hasValidationDate = StringUtils.isNotBlank(validationDate);

    final Relation relation = new Relation();
    relation.setRelType(relType);
    relation.setSubRelType(subRelType);
    relation.setRelClass(relClass);
    relation.setSource(source);
    relation.setTarget(target);
    relation.setCollectedfrom(collectedfrom);
    relation.setDataInfo(dataInfo);
    relation.setLastupdatetimestamp(lastupdatetimestamp);
    relation.setValidated(hasValidationDate);
    if (hasValidationDate) {
        relation.setValidationDate(validationDate);
    } else {
        relation.setValidationDate(null);
    }
    relation.setProperties(properties);
    return relation;
}
} }

View File

@ -118,7 +118,7 @@ public class CollectorWorker extends ReportingJob {
return new RestCollectorPlugin(clientParams); return new RestCollectorPlugin(clientParams);
case file: case file:
return new FileCollectorPlugin(fileSystem); return new FileCollectorPlugin(fileSystem);
case fileGZip: case fileGzip:
return new FileGZipCollectorPlugin(fileSystem); return new FileGZipCollectorPlugin(fileSystem);
case other: case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional final CollectorPlugin.NAME.OTHER_NAME plugin = Optional

View File

@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
public interface CollectorPlugin { public interface CollectorPlugin {
enum NAME { enum NAME {
oai, other, rest_json2xml, file, fileGZip; oai, other, rest_json2xml, file, fileGzip;
public enum OTHER_NAME { public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb mdstore_mongodb_dump, mdstore_mongodb

View File

@ -17,6 +17,9 @@ public class PMArticle implements Serializable {
* the Pubmed Identifier * the Pubmed Identifier
*/ */
private String pmid; private String pmid;
private String pmcId;
/** /**
* the DOI * the DOI
*/ */
@ -140,10 +143,11 @@ public class PMArticle implements Serializable {
} }
/** /**
* English-language abstracts are taken directly from the published article. * <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English;
* If the article does not have a published abstract, the National Library of Medicine does not create one, * those titles originally published in a non-English language and translated for <ArticleTitle> are enclosed in square brackets.
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally * All titles end with a period unless another punctuation mark such as a question mark or bracket is present.
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. * Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl).
* Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000.
* *
* @return the extracted pubmed Title * @return the extracted pubmed Title
*/ */
@ -250,4 +254,13 @@ public class PMArticle implements Serializable {
public List<PMGrant> getGrants() { public List<PMGrant> getGrants() {
return grants; return grants;
} }
/**
 * Gets the PMC identifier stored on this article.
 *
 * @return the PMC identifier, or {@code null} when none has been set
 */
public String getPmcId() {
    return this.pmcId;
}
/**
 * Stores the PMC identifier on this article.
 *
 * @param pmcId the PMC identifier to store
 * @return this article, so that calls can be chained
 */
public PMArticle setPmcId(final String pmcId) {
    this.pmcId = pmcId;
    return this;
}
} }

View File

@ -584,7 +584,12 @@ object DataciteToOAFTransformation {
JField("awardUri", JString(awardUri)) <- fundingReferences JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri } yield awardUri
val oid = result.getId
result.setId(IdentifierFactory.createIdentifier(result)) result.setId(IdentifierFactory.createIdentifier(result))
if (!result.getId.equalsIgnoreCase(oid)) {
result.setOriginalId((oid :: List(doi)).asJava)
}
var relations: List[Relation] = var relations: List[Relation] =
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null) awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)

View File

@ -98,6 +98,7 @@ class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {
case "PMID" => currentArticle.setPmid(text.trim) case "PMID" => currentArticle.setPmid(text.trim)
case "ArticleId" => case "ArticleId" =>
if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim) if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(text.trim)
case "Language" => currentArticle.setLanguage(text.trim) case "Language" => currentArticle.setLanguage(text.trim)
case "ISSN" => currentJournal.setIssn(text.trim) case "ISSN" => currentJournal.setIssn(text.trim)
case "GrantID" => currentGrant.setGrantID(text.trim) case "GrantID" => currentGrant.setGrantID(text.trim)

View File

@ -4,9 +4,12 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.schema.oaf._
import collection.JavaConverters._ import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import collection.JavaConverters._
import java.util.regex.Pattern import java.util.regex.Pattern
import scala.collection.mutable.ListBuffer
/** /**
*/ */
@ -14,6 +17,9 @@ object PubMedToOaf {
val SUBJ_CLASS = "keywords" val SUBJ_CLASS = "keywords"
val OAI_HEADER = "oai:pubmedcentral.nih.gov:"
val OLD_PMC_PREFIX = "od_______267::"
val urlMap = Map( val urlMap = Map(
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/", "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
"doi" -> "https://dx.doi.org/" "doi" -> "https://dx.doi.org/"
@ -50,6 +56,15 @@ object PubMedToOaf {
null null
} }
/** Build the legacy OpenAIRE identifier of an article from its PMC identifier.
  *
  * The identifier is OLD_PMC_PREFIX followed by the md5 of OAI_HEADER concatenated
  * with the PMC identifier stripped of every "PMC" occurrence.
  *
  * @param article the article providing the PMC identifier
  * @return the legacy identifier, or null when the article has no (or an empty) PMC identifier
  */
def createOriginalOpenaireId(article: PMArticle): String = {
  val pmcId = article.getPmcId
  if (!StringUtils.isNotEmpty(pmcId))
    null
  else {
    val oaiIdentifier = s"$OAI_HEADER${pmcId.replace("PMC", "")}"
    val hashed = DHPUtils.md5(oaiIdentifier)
    s"$OLD_PMC_PREFIX$hashed"
  }
}
/** Create an instance of class extends Result /** Create an instance of class extends Result
* starting from OAF instanceType value * starting from OAF instanceType value
* *
@ -122,8 +137,9 @@ object PubMedToOaf {
return null return null
// MAP PMID into pid with classid = classname = pmid // MAP PMID into pid with classid = classname = pmid
val pidList: List[StructuredProperty] = List( val pidList = ListBuffer[StructuredProperty]()
OafMapperUtils.structuredProperty(
pidList += OafMapperUtils.structuredProperty(
article.getPmid, article.getPmid,
PidType.pmid.toString, PidType.pmid.toString,
PidType.pmid.toString, PidType.pmid.toString,
@ -131,7 +147,17 @@ object PubMedToOaf {
ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES,
dataInfo dataInfo
) )
if (StringUtils.isNotBlank(article.getPmcId)) {
pidList += OafMapperUtils.structuredProperty(
article.getPmcId,
PidType.pmc.toString,
PidType.pmc.toString,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES,
dataInfo
) )
}
if (pidList == null) if (pidList == null)
return null return null
@ -186,6 +212,7 @@ object PubMedToOaf {
val urlLists: List[String] = pidList val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
.filter(t => t._1.nonEmpty) .filter(t => t._1.nonEmpty)
.toList
.map(t => t._1 + t._2) .map(t => t._1 + t._2)
if (urlLists != null) if (urlLists != null)
pubmedInstance.setUrl(urlLists.asJava) pubmedInstance.setUrl(urlLists.asJava)
@ -262,6 +289,13 @@ object PubMedToOaf {
if (authors != null && authors.nonEmpty) if (authors != null && authors.nonEmpty)
result.setAuthor(authors.asJava) result.setAuthor(authors.asJava)
if (StringUtils.isNotEmpty(article.getPmcId)) {
val originalIDS = ListBuffer[String]()
originalIDS += createOriginalOpenaireId(article)
pidList.map(s => s.getValue).foreach(p => originalIDS += p)
result.setOriginalId(originalIDS.asJava)
} else
result.setOriginalId(pidList.map(s => s.getValue).asJava) result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(article.getPmid) result.setId(article.getPmid)

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.actionmanager.ror; package eu.dnetlib.dhp.actionmanager.ror;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.util.List; import java.util.List;
@ -38,25 +39,20 @@ class GenerateRorActionSetJobTest {
.readValue(IOUtils.toString(getClass().getResourceAsStream("ror_org.json")), RorOrganization.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("ror_org.json")), RorOrganization.class);
final List<AtomicAction<? extends Oaf>> aas = GenerateRorActionSetJob.convertRorOrg(r); final List<AtomicAction<? extends Oaf>> aas = GenerateRorActionSetJob.convertRorOrg(r);
Assertions.assertEquals(3, aas.size()); Assertions.assertEquals(1, aas.size());
assertEquals(Organization.class, aas.get(0).getClazz()); assertEquals(Organization.class, aas.get(0).getClazz());
assertEquals(Relation.class, aas.get(1).getClazz());
assertEquals(Relation.class, aas.get(2).getClazz());
final Organization o = (Organization) aas.get(0).getPayload(); final Organization o = (Organization) aas.get(0).getPayload();
final Relation r1 = (Relation) aas.get(1).getPayload();
final Relation r2 = (Relation) aas.get(2).getPayload();
assertEquals(o.getId(), r1.getSource()); assertNotNull(o);
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget()); assertNotNull(o.getCountry());
assertEquals(ModelConstants.IS_PARENT_OF, r1.getRelClass()); assertEquals("AU", o.getCountry().getClassid());
assertEquals(ModelConstants.IS_CHILD_OF, r2.getRelClass());
assertNotNull(o.getLegalname());
assertEquals("Mount Stromlo Observatory", o.getLegalname().getValue());
System.out.println(mapper.writeValueAsString(o)); System.out.println(mapper.writeValueAsString(o));
System.out.println(mapper.writeValueAsString(r1));
System.out.println(mapper.writeValueAsString(r2));
} }
@Test @Test

View File

@ -195,7 +195,9 @@
<Title>Biochemical and biophysical research communications</Title> <Title>Biochemical and biophysical research communications</Title>
<ISOAbbreviation>Biochem Biophys Res Commun</ISOAbbreviation> <ISOAbbreviation>Biochem Biophys Res Commun</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>Delineation of the intimate details of the backbone conformation of pyridine nucleotide coenzymes in aqueous solution.</ArticleTitle> <ArticleTitle>Delineation of the intimate details of the backbone conformation of pyridine nucleotide
coenzymes in aqueous solution.
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1173-9</MedlinePgn> <MedlinePgn>1173-9</MedlinePgn>
</Pagination> </Pagination>
@ -473,7 +475,9 @@
<Title>Biochemical and biophysical research communications</Title> <Title>Biochemical and biophysical research communications</Title>
<ISOAbbreviation>Biochem Biophys Res Commun</ISOAbbreviation> <ISOAbbreviation>Biochem Biophys Res Commun</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>Effect of chloroquine on cultured fibroblasts: release of lysosomal hydrolases and inhibition of their uptake.</ArticleTitle> <ArticleTitle>Effect of chloroquine on cultured fibroblasts: release of lysosomal hydrolases and
inhibition of their uptake.
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1338-43</MedlinePgn> <MedlinePgn>1338-43</MedlinePgn>
</Pagination> </Pagination>
@ -657,7 +661,8 @@
<Title>Biochemical and biophysical research communications</Title> <Title>Biochemical and biophysical research communications</Title>
<ISOAbbreviation>Biochem Biophys Res Commun</ISOAbbreviation> <ISOAbbreviation>Biochem Biophys Res Commun</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>Atomic models for the polypeptide backbones of myohemerythrin and hemerythrin.</ArticleTitle> <ArticleTitle>Atomic models for the polypeptide backbones of myohemerythrin and hemerythrin.
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1349-56</MedlinePgn> <MedlinePgn>1349-56</MedlinePgn>
</Pagination> </Pagination>
@ -1627,7 +1632,9 @@
<Title>Biochemical pharmacology</Title> <Title>Biochemical pharmacology</Title>
<ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation> <ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>Comparison between procaine and isocarboxazid metabolism in vitro by a liver microsomal amidase-esterase.</ArticleTitle> <ArticleTitle>Comparison between procaine and isocarboxazid metabolism in vitro by a liver microsomal
amidase-esterase.
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1517-21</MedlinePgn> <MedlinePgn>1517-21</MedlinePgn>
</Pagination> </Pagination>
@ -2030,7 +2037,9 @@
<Title>Biochemical pharmacology</Title> <Title>Biochemical pharmacology</Title>
<ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation> <ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>Radiochemical assay of glutathione S-epoxide transferase and its enhancement by phenobarbital in rat liver in vivo.</ArticleTitle> <ArticleTitle>Radiochemical assay of glutathione S-epoxide transferase and its enhancement by
phenobarbital in rat liver in vivo.
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1569-72</MedlinePgn> <MedlinePgn>1569-72</MedlinePgn>
</Pagination> </Pagination>
@ -2350,7 +2359,9 @@
<Title>Biochemical pharmacology</Title> <Title>Biochemical pharmacology</Title>
<ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation> <ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>Identification of adenylate cyclase-coupled beta-adrenergic receptors with radiolabeled beta-adrenergic antagonists.</ArticleTitle> <ArticleTitle>Identification of adenylate cyclase-coupled beta-adrenergic receptors with radiolabeled
beta-adrenergic antagonists.
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1651-8</MedlinePgn> <MedlinePgn>1651-8</MedlinePgn>
</Pagination> </Pagination>
@ -2598,7 +2609,9 @@
<Title>Biochemical pharmacology</Title> <Title>Biochemical pharmacology</Title>
<ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation> <ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>The effect of adrenaline and of alpha- and beta-adrenergic blocking agents on ATP concentration and on incorporation of 32Pi into ATP in rat fat cells.</ArticleTitle> <ArticleTitle>The effect of adrenaline and of alpha- and beta-adrenergic blocking agents on ATP
concentration and on incorporation of 32Pi into ATP in rat fat cells.
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1659-62</MedlinePgn> <MedlinePgn>1659-62</MedlinePgn>
</Pagination> </Pagination>
@ -2851,7 +2864,9 @@
<Title>Biochemical pharmacology</Title> <Title>Biochemical pharmacology</Title>
<ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation> <ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>Action of propranolol on mitochondrial functions--effects on energized ion fluxes in the presence of valinomycin.</ArticleTitle> <ArticleTitle>Action of propranolol on mitochondrial functions--effects on energized ion fluxes in the
presence of valinomycin.
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1701-5</MedlinePgn> <MedlinePgn>1701-5</MedlinePgn>
</Pagination> </Pagination>
@ -3265,7 +3280,8 @@
</Chemical> </Chemical>
<Chemical> <Chemical>
<RegistryNumber>EC 2.6.1.16</RegistryNumber> <RegistryNumber>EC 2.6.1.16</RegistryNumber>
<NameOfSubstance UI="D005945">Glutamine-Fructose-6-Phosphate Transaminase (Isomerizing)</NameOfSubstance> <NameOfSubstance UI="D005945">Glutamine-Fructose-6-Phosphate Transaminase (Isomerizing)
</NameOfSubstance>
</Chemical> </Chemical>
<Chemical> <Chemical>
<RegistryNumber>EC 2.7.-</RegistryNumber> <RegistryNumber>EC 2.7.-</RegistryNumber>
@ -3324,7 +3340,9 @@
<DescriptorName UI="D005944" MajorTopicYN="N">Glucosamine</DescriptorName> <DescriptorName UI="D005944" MajorTopicYN="N">Glucosamine</DescriptorName>
</MeshHeading> </MeshHeading>
<MeshHeading> <MeshHeading>
<DescriptorName UI="D005945" MajorTopicYN="N">Glutamine-Fructose-6-Phosphate Transaminase (Isomerizing)</DescriptorName> <DescriptorName UI="D005945" MajorTopicYN="N">Glutamine-Fructose-6-Phosphate Transaminase
(Isomerizing)
</DescriptorName>
<QualifierName UI="Q000378" MajorTopicYN="N">metabolism</QualifierName> <QualifierName UI="Q000378" MajorTopicYN="N">metabolism</QualifierName>
</MeshHeading> </MeshHeading>
<MeshHeading> <MeshHeading>
@ -3463,7 +3481,8 @@
<Title>Biochemical pharmacology</Title> <Title>Biochemical pharmacology</Title>
<ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation> <ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>Inhibition of aldehyde reductase by acidic metabolites of the biogenic amines.</ArticleTitle> <ArticleTitle>Inhibition of aldehyde reductase by acidic metabolites of the biogenic amines.
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1731-3</MedlinePgn> <MedlinePgn>1731-3</MedlinePgn>
</Pagination> </Pagination>
@ -3696,7 +3715,9 @@
<Title>Biochemical pharmacology</Title> <Title>Biochemical pharmacology</Title>
<ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation> <ISOAbbreviation>Biochem Pharmacol</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>Effects of 5,6-dihydroxytryptamine on tyrosine-hydroxylase activity in central catecholaminergic neurons of the rat.</ArticleTitle> <ArticleTitle>Effects of 5,6-dihydroxytryptamine on tyrosine-hydroxylase activity in central
catecholaminergic neurons of the rat.
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1739-42</MedlinePgn> <MedlinePgn>1739-42</MedlinePgn>
</Pagination> </Pagination>
@ -4602,12 +4623,19 @@
<Title>Arzneimittel-Forschung</Title> <Title>Arzneimittel-Forschung</Title>
<ISOAbbreviation>Arzneimittelforschung</ISOAbbreviation> <ISOAbbreviation>Arzneimittelforschung</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>[Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol (author's transl)].</ArticleTitle> <ArticleTitle>[Biochemical studies on camomile components/III. In vitro studies about the antipeptic
activity of (--)-alpha-bisabolol (author's transl)].
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1352-4</MedlinePgn> <MedlinePgn>1352-4</MedlinePgn>
</Pagination> </Pagination>
<Abstract> <Abstract>
<AbstractText>(--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost.</AbstractText> <AbstractText>(--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not
caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50
percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol
only occurs in case of direct contact. In case of a previous contact with the substrate, the
inhibiting effect is lost.
</AbstractText>
</Abstract> </Abstract>
<AuthorList CompleteYN="Y"> <AuthorList CompleteYN="Y">
<Author ValidYN="Y"> <Author ValidYN="Y">
@ -4626,7 +4654,9 @@
<PublicationType UI="D004740">English Abstract</PublicationType> <PublicationType UI="D004740">English Abstract</PublicationType>
<PublicationType UI="D016428">Journal Article</PublicationType> <PublicationType UI="D016428">Journal Article</PublicationType>
</PublicationTypeList> </PublicationTypeList>
<VernacularTitle>Biochemische Untersuchungen von Kamilleninhaltsstoffen. III. In-vitro-Versuche über die antipeptische Wirkung des (-)-alpha-Bisabolols</VernacularTitle> <VernacularTitle>Biochemische Untersuchungen von Kamilleninhaltsstoffen. III. In-vitro-Versuche über die
antipeptische Wirkung des (-)-alpha-Bisabolols
</VernacularTitle>
</Article> </Article>
<MedlineJournalInfo> <MedlineJournalInfo>
<Country>Germany</Country> <Country>Germany</Country>
@ -4753,12 +4783,37 @@
<Title>Arzneimittel-Forschung</Title> <Title>Arzneimittel-Forschung</Title>
<ISOAbbreviation>Arzneimittelforschung</ISOAbbreviation> <ISOAbbreviation>Arzneimittelforschung</ISOAbbreviation>
</Journal> </Journal>
<ArticleTitle>[Demonstration of tumor inhibiting properties of a strongly immunostimulating low-molecular weight substance. Comparative studies with ifosfamide on the immuno-labile DS carcinosarcoma. Stimulation of the autoimmune activity for approx. 20 days by BA 1, a N-(2-cyanoethylene)-urea. Novel prophylactic possibilities].</ArticleTitle> <ArticleTitle>[Demonstration of tumor inhibiting properties of a strongly immunostimulating
low-molecular weight substance. Comparative studies with ifosfamide on the immuno-labile DS
carcinosarcoma. Stimulation of the autoimmune activity for approx. 20 days by BA 1, a
N-(2-cyanoethylene)-urea. Novel prophylactic possibilities].
</ArticleTitle>
<Pagination> <Pagination>
<MedlinePgn>1369-79</MedlinePgn> <MedlinePgn>1369-79</MedlinePgn>
</Pagination> </Pagination>
<Abstract> <Abstract>
<AbstractText>A report is given on the recent discovery of outstanding immunological properties in BA 1 [N-(2-cyanoethylene)-urea] having a (low) molecular mass M = 111.104. Experiments in 214 DS carcinosarcoma bearing Wistar rats have shown that BA 1, at a dosage of only about 12 percent LD50 (150 mg kg) and negligible lethality (1.7 percent), results in a recovery rate of 40 percent without hyperglycemia and, in one test, of 80 percent with hyperglycemia. Under otherwise unchanged conditions the reference substance ifosfamide (IF) -- a further development of cyclophosphamide -- applied without hyperglycemia in its most efficient dosage of 47 percent LD50 (150 mg kg) brought about a recovery rate of 25 percent at a lethality of 18 percent. (Contrary to BA 1, 250-min hyperglycemia caused no further improvement of the recovery rate.) However this comparison is characterized by the fact that both substances exhibit two quite different (complementary) mechanisms of action. Leucocyte counts made after application of the said cancerostatics and dosages have shown a pronounced stimulation with BA 1 and with ifosfamide, the known suppression in the post-therapeutic interval usually found with standard cancerostatics. In combination with the cited plaque test for BA 1, blood pictures then allow conclusions on the immunity status. Since IF can be taken as one of the most efficient cancerostatics--there is no other chemotherapeutic known up to now that has a more significant effect on the DS carcinosarcoma in rats -- these findings are of special importance. Finally, the total amount of leucocytes and lymphocytes as well as their time behaviour was determined from the blood picture of tumour-free rats after i.v. application of BA 1. 
The thus obtained numerical values clearly show that further research work on the prophylactic use of this substance seems to be necessary and very promising.</AbstractText> <AbstractText>A report is given on the recent discovery of outstanding immunological properties in
BA 1 [N-(2-cyanoethylene)-urea] having a (low) molecular mass M = 111.104. Experiments in 214 DS
carcinosarcoma bearing Wistar rats have shown that BA 1, at a dosage of only about 12 percent
LD50 (150 mg kg) and negligible lethality (1.7 percent), results in a recovery rate of 40
percent without hyperglycemia and, in one test, of 80 percent with hyperglycemia. Under
otherwise unchanged conditions the reference substance ifosfamide (IF) -- a further development
of cyclophosphamide -- applied without hyperglycemia in its most efficient dosage of 47 percent
LD50 (150 mg kg) brought about a recovery rate of 25 percent at a lethality of 18 percent.
(Contrary to BA 1, 250-min hyperglycemia caused no further improvement of the recovery rate.)
However this comparison is characterized by the fact that both substances exhibit two quite
different (complementary) mechanisms of action. Leucocyte counts made after application of the
said cancerostatics and dosages have shown a pronounced stimulation with BA 1 and with
ifosfamide, the known suppression in the post-therapeutic interval usually found with standard
cancerostatics. In combination with the cited plaque test for BA 1, blood pictures then allow
conclusions on the immunity status. Since IF can be taken as one of the most efficient
cancerostatics--there is no other chemotherapeutic known up to now that has a more significant
effect on the DS carcinosarcoma in rats -- these findings are of special importance. Finally,
the total amount of leucocytes and lymphocytes as well as their time behaviour was determined
from the blood picture of tumour-free rats after i.v. application of BA 1. The thus obtained
numerical values clearly show that further research work on the prophylactic use of this
substance seems to be necessary and very promising.
</AbstractText>
</Abstract> </Abstract>
<AuthorList CompleteYN="Y"> <AuthorList CompleteYN="Y">
<Author ValidYN="Y"> <Author ValidYN="Y">
@ -4778,7 +4833,11 @@
<PublicationType UI="D004740">English Abstract</PublicationType> <PublicationType UI="D004740">English Abstract</PublicationType>
<PublicationType UI="D016428">Journal Article</PublicationType> <PublicationType UI="D016428">Journal Article</PublicationType>
</PublicationTypeList> </PublicationTypeList>
<VernacularTitle>Nachweis krebshemmender Eigenschaften einer stark immunstimulierenden Verbindung kleiner Molekülmasse. Versuche am immunlabilen DS-Karzinosarkom im Vergleich mit Ifosfamid. Stimulierung der körpereigenen Abwehr über etwa 20 Tage durch BA 1, einen N-(2-Cyanthylen)-harnstoff. Neue prophylaktische Möglichkeiten</VernacularTitle> <VernacularTitle>Nachweis krebshemmender Eigenschaften einer stark immunstimulierenden Verbindung
kleiner Molekülmasse. Versuche am immunlabilen DS-Karzinosarkom im Vergleich mit Ifosfamid.
Stimulierung der körpereigenen Abwehr über etwa 20 Tage durch BA 1, einen
N-(2-Cyanthylen)-harnstoff. Neue prophylaktische Möglichkeiten
</VernacularTitle>
</Article> </Article>
<MedlineJournalInfo> <MedlineJournalInfo>
<Country>Germany</Country> <Country>Germany</Country>
@ -5016,7 +5075,20 @@
<MedlinePgn>1400-3</MedlinePgn> <MedlinePgn>1400-3</MedlinePgn>
</Pagination> </Pagination>
<Abstract> <Abstract>
<AbstractText>The distribution of blood flow to the subendocardial, medium and subepicardial layers of the left ventricular free wall was studied in anaesthetized dogs under normoxic (A), hypoxic (B) conditions and under pharmacologically induced (etafenone) coronary vasodilation (C). Regional myocardial blood flow was determined by means of the particle distribution method. In normoxia a transmural gradient of flow was observed, with the subendocardial layers receiving a significantly higher flow rate compared with the subepicardial layers. In hypoxia induced vasodilation this transmural gradient of flow was persistent. In contrast a marked redistribution of regional flow was observed under pharmacologically induced vasodilation. The transmural gradient decreased. In contrast to some findings these experiments demonstrate that a considerable vasodilatory capacity exists in all layers of the myocardium and can be utilized by drugs. The differences observed for the intramural distribution pattern of flow under hypoxia and drug induced vasodilation support the hypothesis that this pattern reflects corresponding gradients of regional myocardial metabolism.</AbstractText> <AbstractText>The distribution of blood flow to the subendocardial, medium and subepicardial layers
of the left ventricular free wall was studied in anaesthetized dogs under normoxic (A), hypoxic
(B) conditions and under pharmacologically induced (etafenone) coronary vasodilation (C).
Regional myocardial blood flow was determined by means of the particle distribution method. In
normoxia a transmural gradient of flow was observed, with the subendocardial layers receiving a
significantly higher flow rate compared with the subepicardial layers. In hypoxia induced
vasodilation this transmural gradient of flow was persistent. In contrast a marked
redistribution of regional flow was observed under pharmacologically induced vasodilation. The
transmural gradient decreased. In contrast to some findings these experiments demonstrate that a
considerable vasodilatory capacity exists in all layers of the myocardium and can be utilized by
drugs. The differences observed for the intramural distribution pattern of flow under hypoxia
and drug induced vasodilation support the hypothesis that this pattern reflects corresponding
gradients of regional myocardial metabolism.
</AbstractText>
</Abstract> </Abstract>
<AuthorList CompleteYN="Y"> <AuthorList CompleteYN="Y">
<Author ValidYN="Y"> <Author ValidYN="Y">
@ -5185,4 +5257,151 @@
</ReferenceList> </ReferenceList>
</PubmedData> </PubmedData>
</PubmedArticle> </PubmedArticle>
<PubmedArticle>
<MedlineCitation Status="MEDLINE" Owner="NLM">
<PMID Version="1">4917185</PMID>
<DateCompleted>
<Year>1970</Year>
<Month>10</Month>
<Day>27</Day>
</DateCompleted>
<DateRevised>
<Year>2018</Year>
<Month>11</Month>
<Day>13</Day>
</DateRevised>
<Article PubModel="Print">
<Journal>
<ISSN IssnType="Print">0003-6919</ISSN>
<JournalIssue CitedMedium="Print">
<Volume>19</Volume>
<Issue>6</Issue>
<PubDate>
<Year>1970</Year>
<Month>Jun</Month>
</PubDate>
</JournalIssue>
<Title>Applied microbiology</Title>
<ISOAbbreviation>Appl Microbiol</ISOAbbreviation>
</Journal>
<ArticleTitle>Bactericidal activity of a broad-spectrum illumination source.</ArticleTitle>
<Pagination>
<MedlinePgn>1013-4</MedlinePgn>
</Pagination>
<Abstract>
<AbstractText>Several hours of exposure to Vita-Lite lamps, which have a unique spectral
distribution, give significant killing of cells of Staphylococcus aureus.
</AbstractText>
</Abstract>
<AuthorList CompleteYN="Y">
<Author ValidYN="Y">
<LastName>Himmelfarb</LastName>
<ForeName>P</ForeName>
<Initials>P</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Scott</LastName>
<ForeName>A</ForeName>
<Initials>A</Initials>
</Author>
<Author ValidYN="Y">
<LastName>Thayer</LastName>
<ForeName>P S</ForeName>
<Initials>PS</Initials>
</Author>
</AuthorList>
<Language>eng</Language>
<PublicationTypeList>
<PublicationType UI="D016428">Journal Article</PublicationType>
</PublicationTypeList>
</Article>
<MedlineJournalInfo>
<Country>United States</Country>
<MedlineTA>Appl Microbiol</MedlineTA>
<NlmUniqueID>7605802</NlmUniqueID>
<ISSNLinking>0003-6919</ISSNLinking>
</MedlineJournalInfo>
<CitationSubset>IM</CitationSubset>
<MeshHeadingList>
<MeshHeading>
<DescriptorName UI="D001431" MajorTopicYN="N">Bacteriological Techniques</DescriptorName>
<QualifierName UI="Q000295" MajorTopicYN="Y">instrumentation</QualifierName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D008027" MajorTopicYN="Y">Light</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D011830" MajorTopicYN="N">Radiation Effects</DescriptorName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D012706" MajorTopicYN="N">Serratia marcescens</DescriptorName>
<QualifierName UI="Q000254" MajorTopicYN="N">growth &amp; development</QualifierName>
<QualifierName UI="Q000528" MajorTopicYN="Y">radiation effects</QualifierName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D013210" MajorTopicYN="N">Staphylococcus</DescriptorName>
<QualifierName UI="Q000254" MajorTopicYN="N">growth &amp; development</QualifierName>
<QualifierName UI="Q000528" MajorTopicYN="Y">radiation effects</QualifierName>
</MeshHeading>
<MeshHeading>
<DescriptorName UI="D013242" MajorTopicYN="N">Sterilization</DescriptorName>
</MeshHeading>
</MeshHeadingList>
</MedlineCitation>
<PubmedData>
<History>
<PubMedPubDate PubStatus="pubmed">
<Year>1970</Year>
<Month>6</Month>
<Day>1</Day>
</PubMedPubDate>
<PubMedPubDate PubStatus="medline">
<Year>1970</Year>
<Month>6</Month>
<Day>1</Day>
<Hour>0</Hour>
<Minute>1</Minute>
</PubMedPubDate>
<PubMedPubDate PubStatus="entrez">
<Year>1970</Year>
<Month>6</Month>
<Day>1</Day>
<Hour>0</Hour>
<Minute>0</Minute>
</PubMedPubDate>
</History>
<PublicationStatus>ppublish</PublicationStatus>
<ArticleIdList>
<ArticleId IdType="pubmed">4917185</ArticleId>
<ArticleId IdType="pmc">PMC376844</ArticleId>
</ArticleIdList>
<ReferenceList>
<Reference>
<Citation>Photochem Photobiol. 1969 Jan;9(1):99-102</Citation>
<ArticleIdList>
<ArticleId IdType="pubmed">4889809</ArticleId>
</ArticleIdList>
</Reference>
<Reference>
<Citation>Endocrinology. 1969 Dec;85(6):1218-21</Citation>
<ArticleIdList>
<ArticleId IdType="pubmed">5347623</ArticleId>
</ArticleIdList>
</Reference>
<Reference>
<Citation>Arch Mikrobiol. 1956;24(1):60-79</Citation>
<ArticleIdList>
<ArticleId IdType="pubmed">13327987</ArticleId>
</ArticleIdList>
</Reference>
<Reference>
<Citation>J Bacteriol. 1941 Sep;42(3):353-66</Citation>
<ArticleIdList>
<ArticleId IdType="pubmed">16560457</ArticleId>
</ArticleIdList>
</Reference>
</ReferenceList>
</PubmedData>
</PubmedArticle>
</PubmedArticleSet> </PubmedArticleSet>

View File

@ -2,11 +2,14 @@ package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature} import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
import org.apache.commons.io.FileUtils import org.apache.commons.io.FileUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.{col, count} import org.apache.spark.sql.functions.{col, count}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test} import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
@ -70,17 +73,15 @@ class DataciteToOAFTest extends AbstractVocabularyTest {
assertEquals(100, nativeSize) assertEquals(100, nativeSize)
spark.read.load(targetPath).printSchema(); val result: Dataset[String] =
spark.read.text(targetPath).as[String].map(DataciteUtilityTest.convertToOAF)(Encoders.STRING)
val result: Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
result result
.map(s => s.getClass.getSimpleName)
.groupBy(col("value").alias("class")) .groupBy(col("value").alias("class"))
.agg(count("value").alias("Total")) .agg(count("value").alias("Total"))
.show(false) .show(false)
val t = spark.read.load(targetPath).count() val t = spark.read.text(targetPath).as[String].count()
assertTrue(t > 0) assertTrue(t > 0)

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.datacite
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
/** Test helper that labels a JSON-serialised OAF record with a coarse type name. */
object DataciteUtilityTest {

  /** Classifies one JSON record.
    *
    * @param input the JSON text of a single record
    * @return `"Relation"` when a `source` field can be extracted as a string from anywhere in the
    *         document; otherwise the lower-cased `classname` of the first `instancetype` found
    *         under an `instance` field. Throws `NoSuchElementException` when no such classname
    *         exists, because the first element of an empty list is requested.
    */
  def convertToOAF(input: String): String = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)

    // "NULL" is the sentinel meaning "no source field could be extracted"
    val sourceOrSentinel: String = (json \\ "source").extractOrElse("NULL")

    if (sourceOrSentinel != "NULL") {
      "Relation"
    } else {
      val classNames: List[String] = for {
        JObject(instance)                             <- json \\ "instance"
        JField("instancetype", JObject(instancetype)) <- instance
        JField("classname", JString(classname))       <- instancetype
      } yield classname

      classNames.head.toLowerCase()
    }
  }
}

View File

@ -2,9 +2,10 @@ package eu.dnetlib.dhp.sx.bio
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} import eu.dnetlib.dhp.schema.oaf.utils.PidType
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf} import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf}
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
@ -16,6 +17,7 @@ import org.mockito.junit.jupiter.MockitoExtension
import java.io.{BufferedReader, InputStream, InputStreamReader} import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.util.zip.GZIPInputStream import java.util.zip.GZIPInputStream
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer
import scala.io.Source import scala.io.Source
import scala.xml.pull.XMLEventReader import scala.xml.pull.XMLEventReader
@ -74,6 +76,95 @@ class BioScholixTest extends AbstractVocabularyTest {
} }
/** Asserts that a parsed article has a pmid, a title and an author list whose entries
  * are all non-null and carry a full name.
  */
private def checkPMArticle(article: PMArticle): Unit = {
  assertNotNull(article.getPmid)
  assertNotNull(article.getTitle)
  assertNotNull(article.getAuthors)
  for (author <- article.getAuthors.asScala) {
    assertNotNull(author)
    assertNotNull(author.getFullName)
  }
}
/** Parses the bundled PubMed XML resource and validates every article it yields. */
@Test
def testParsingPubmedXML(): Unit = {
  val resource = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")
  val events = new XMLEventReader(Source.fromInputStream(resource))
  new PMParser(events).foreach(article => checkPMArticle(article))
}
/** Asserts that a converted record is a Publication with the mandatory fields set.
  *
  * Checks id, titles, author full names, instances (collectedfrom, pid, instancetype) and
  * original ids. When any instance carries a PMC pid, at least one original id must start
  * with the `od_______267::` prefix.
  */
private def checkPubmedPublication(o: Oaf): Unit = {
  assertTrue(o.isInstanceOf[Publication])
  val publication: Publication = o.asInstanceOf[Publication]

  assertNotNull(publication.getId)
  assertNotNull(publication.getTitle)
  for (title <- publication.getTitle.asScala) assertNotNull(title.getValue)
  for (author <- publication.getAuthor.asScala) assertNotNull(author.getFullname)

  assertNotNull(publication.getInstance())
  for (instance <- publication.getInstance().asScala) {
    assertNotNull(instance.getCollectedfrom)
    assertNotNull(instance.getPid)
    assertNotNull(instance.getInstancetype)
  }

  assertNotNull(publication.getOriginalId)
  for (originalId <- publication.getOriginalId.asScala) assertNotNull(originalId)

  val pmcClassId = PidType.pmc.toString
  val hasPMC = publication.getInstance().asScala.exists { instance =>
    instance.getPid.asScala.exists(pid => pid.getQualifier.getClassid.equalsIgnoreCase(pmcClassId))
  }
  if (hasPMC) {
    assertTrue(publication.getOriginalId.asScala.exists(_.startsWith("od_______267::")))
  }
}
/** Verifies that the publication id depends only on the pmid, and that setting a PMC
  * identifier adds the legacy `od_______267::` original id.
  */
@Test
def testPubmedOriginalID(): Unit = {
  val expectedId = "50|pmid________::81dc9bdb52d04dc20036dbd8313ed055"

  val article: PMArticle = new PMArticle
  article.setPmid("1234")
  article.setTitle("a Title")
  article.getPublicationTypes.add(new PMSubject("article", null, null))

  // The conversion must yield a non-null publication
  val withoutPmc = PubMedToOaf.convert(article, vocabularies).asInstanceOf[Publication]
  assertNotNull(withoutPmc)
  assertEquals(expectedId, withoutPmc.getId)

  // The publication id must stay the same once a PMC identifier is set
  article.setPmcId("PMC1517292")
  val withPmc = PubMedToOaf.convert(article, vocabularies).asInstanceOf[Publication]
  assertNotNull(withPmc)
  assertEquals(expectedId, withPmc.getId)

  // The original id generated the old way from the PMC identifier must be present
  val oldOpenaireID = "od_______267::0000072375bc0e68fa09d4e6b7658248"
  assertTrue(withPmc.getOriginalId.asScala.exists(_.equalsIgnoreCase(oldOpenaireID)))
}
/** Converts every article of the bundled PubMed XML resource to OAF, then validates each
  * converted record. All conversions run before the first validation.
  */
@Test
def testPubmedMapping(): Unit = {
  val resource = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")
  val parser = new PMParser(new XMLEventReader(Source.fromInputStream(resource)))

  val converted = ListBuffer[Oaf]()
  for (article <- parser) converted += PubMedToOaf.convert(article, vocabularies)

  for (oaf <- converted) checkPubmedPublication(oaf)
}
@Test @Test
def testPDBToOAF(): Unit = { def testPDBToOAF(): Unit = {

View File

@ -0,0 +1,29 @@
package eu.dnetlib.dhp.bulktag.eosc;
import java.io.Serializable;
/**
 * Serializable bean pairing a datasource identifier with the identifier of its master.
 * Both properties are plain strings and default to {@code null}.
 *
 * @author miriam.baglioni
 * @Date 21/07/22
 */
public class DatasourceMaster implements Serializable {

	/** Identifier of the datasource. */
	private String datasource;

	/** Identifier of the master associated with the datasource. */
	private String master;

	/** @return the datasource identifier, or {@code null} if never set */
	public String getDatasource() {
		return this.datasource;
	}

	/** @return the master identifier, or {@code null} if never set */
	public String getMaster() {
		return this.master;
	}

	/** @param datasource the datasource identifier to store */
	public void setDatasource(String datasource) {
		this.datasource = datasource;
	}

	/** @param master the master identifier to store */
	public void setMaster(String master) {
		this.master = master;
	}
}

View File

@ -0,0 +1,136 @@
package eu.dnetlib.dhp.bulktag.eosc;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
* @author miriam.baglioni
* @Date 21/07/22
*/
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.common.RelationInverse;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
/**
 * Runs a query against the services database and writes, for every returned row, one
 * JSON-serialised {@link DatasourceMaster} per line to a file on HDFS.
 *
 * <p>The instance owns a database client and an HDFS writer; both are released by {@link #close()}.
 */
public class ReadMasterDatasourceFromDB implements Closeable {

	private static final Log log = LogFactory.getLog(ReadMasterDatasourceFromDB.class);

	/**
	 * Selects the services whose id starts with {@code eosc} (column {@code datasource}) full-joined with the
	 * dedup records whose duplicate starts with {@code eosc} (column {@code master}).
	 */
	private static final String QUERY = "SELECT dso.id datasource, d.id master FROM " +
		"(SELECT id FROM dsm_services WHERE id like 'eosc%') dso " +
		"FULL JOIN " +
		"(SELECT id, duplicate FROM dsm_dedup_services WHERE duplicate like 'eosc%')d " +
		"ON dso.id = d.duplicate";

	private final DbClient dbClient;

	private final BufferedWriter writer;

	private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	/**
	 * Command-line entry point: reads the connection and output parameters described in
	 * {@code datasourcemaster_parameters.json}, then dumps the result of {@link #QUERY} to HDFS.
	 *
	 * @param args the command-line arguments
	 * @throws Exception if argument parsing, the database access or the HDFS access fails
	 */
	public static void main(final String[] args) throws Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					ReadMasterDatasourceFromDB.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/bulktag/datasourcemaster_parameters.json")));

		parser.parseArgument(args);

		final String dbUrl = parser.get("postgresUrl");
		final String dbUser = parser.get("postgresUser");
		final String dbPassword = parser.get("postgresPassword");
		final String hdfsPath = parser.get("hdfsPath");
		final String hdfsNameNode = parser.get("hdfsNameNode");

		try (
			final ReadMasterDatasourceFromDB rmd = new ReadMasterDatasourceFromDB(hdfsPath, hdfsNameNode, dbUrl, dbUser,
				dbPassword)) {

			log.info("Processing datasources...");
			rmd.execute(QUERY, rmd::datasourceMasterMap);
		}
	}

	/**
	 * Executes {@code sql} and writes the object built by {@code producer} for each row.
	 *
	 * @param sql the query to run
	 * @param producer maps the current row of the result set to the object to serialise
	 */
	public void execute(final String sql, final Function<ResultSet, DatasourceMaster> producer) {
		dbClient.processResults(sql, rs -> writeMap(producer.apply(rs)));
	}

	/**
	 * Builds a {@link DatasourceMaster} from the current row.
	 *
	 * <p>The master is the OpenAIRE id computed from the {@code master} column when that column is not blank,
	 * otherwise the OpenAIRE id computed from the {@code datasource} column itself.
	 *
	 * @param rs the result set positioned on the row to read
	 * @return the populated bean
	 * @throws RuntimeException wrapping the {@link SQLException} raised while reading the row
	 */
	public DatasourceMaster datasourceMasterMap(ResultSet rs) {
		try {
			final DatasourceMaster dm = new DatasourceMaster();
			final String datasource = rs.getString("datasource");
			dm.setDatasource(datasource);

			final String master = rs.getString("master");
			final String masterSource = StringUtils.isNotBlank(master) ? master : datasource;
			dm.setMaster(OafMapperUtils.createOpenaireId(10, masterSource, true));
			return dm;
		} catch (final SQLException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Releases the database client and then the HDFS writer.
	 *
	 * <p>The writer is closed in a {@code finally} clause so that buffered output is still flushed and the
	 * stream released when closing the database client fails.
	 *
	 * @throws IOException if closing either resource fails
	 */
	@Override
	public void close() throws IOException {
		try {
			dbClient.close();
		} finally {
			writer.close();
		}
	}

	/**
	 * Opens the database client and the HDFS output file. The file is appended to when it already exists and
	 * created otherwise; text is encoded as UTF-8.
	 *
	 * @param hdfsPath the path of the output file
	 * @param hdfsNameNode the value used for {@code fs.defaultFS}
	 * @param dbUrl the database url
	 * @param dbUser the database user
	 * @param dbPassword the database password
	 * @throws IOException if the HDFS file cannot be opened
	 */
	public ReadMasterDatasourceFromDB(
		final String hdfsPath, String hdfsNameNode, final String dbUrl, final String dbUser, final String dbPassword)
		throws IOException {

		this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);

		try {
			final Configuration conf = new Configuration();
			conf.set("fs.defaultFS", hdfsNameNode);

			final FileSystem fileSystem = FileSystem.get(conf);
			final Path hdfsWritePath = new Path(hdfsPath);
			final FSDataOutputStream fsDataOutputStream = fileSystem.exists(hdfsWritePath)
				? fileSystem.append(hdfsWritePath)
				: fileSystem.create(hdfsWritePath);

			this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
		} catch (final IOException | RuntimeException e) {
			// the caller never receives the instance, so the already opened client must be released here
			try {
				dbClient.close();
			} catch (final Exception closeFailure) {
				e.addSuppressed(closeFailure);
			}
			throw e;
		}
	}

	/**
	 * Serialises {@code dm} as JSON and writes it followed by a line separator.
	 *
	 * @param dm the bean to write
	 * @throws RuntimeException wrapping the {@link IOException} raised while serialising or writing
	 */
	protected void writeMap(final DatasourceMaster dm) {
		try {
			writer.write(OBJECT_MAPPER.writeValueAsString(dm));
			writer.newLine();
		} catch (final IOException e) {
			throw new RuntimeException(e);
		}
	}
}

View File

@ -0,0 +1,170 @@
package eu.dnetlib.dhp.bulktag.eosc;
import static eu.dnetlib.dhp.PropagationConstant.readPath;
import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.*;
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.TAGGING_TRUST;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import javax.print.attribute.DocAttributeSet;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bulktag.SparkBulkTagJob;
import eu.dnetlib.dhp.bulktag.community.*;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
/**
 * Spark job that adds the {@code eosc} context to the results of one result table, then writes the
 * enriched records back over the input path (going through a working path first).
 *
 * @author miriam.baglioni
 * @Date 21/07/22
 */
public class SparkEoscBulkTag implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(SparkEoscBulkTag.class);
	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	/** Identifier of the context added to the tagged results. */
	private static final String EOSC_CONTEXT_ID = "eosc";

	/**
	 * Command-line entry point; the accepted parameters are described in
	 * {@code input_eosc_bulkTag_parameters.json}.
	 *
	 * @param args the command-line arguments
	 * @throws Exception if argument parsing fails, the result class cannot be loaded or the job fails
	 */
	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkEoscBulkTag.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/bulktag/input_eosc_bulkTag_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String workingPath = parser.get("workingPath");
		log.info("workingPath: {}", workingPath);

		String datasourceMapPath = parser.get("datasourceMapPath");
		log.info("datasourceMapPath: {}", datasourceMapPath);

		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		// the class name comes from the job parameters, so the cast cannot be checked at compile time
		@SuppressWarnings("unchecked")
		Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				removeOutputDir(spark, workingPath);
				execBulkTag(spark, inputPath, workingPath, datasourceMapPath, resultClazz);
			});
	}

	/**
	 * Tags the results found under {@code inputPath}: the enriched records are written gzip-compressed to
	 * {@code workingPath} and then copied from there back over {@code inputPath}.
	 *
	 * @param spark the active session
	 * @param inputPath where the results are read from and finally written to
	 * @param workingPath temporary location of the enriched results
	 * @param datasourceMapPath location of the serialised {@link DatasourceMaster} records
	 * @param resultClazz the concrete result type stored under {@code inputPath}
	 */
	private static <R extends Result> void execBulkTag(
		SparkSession spark,
		String inputPath,
		String workingPath,
		String datasourceMapPath,
		Class<R> resultClazz) {

		// a hash set keeps the per-instance membership test constant-time whatever the number of datasources
		final Set<String> hostedByIds = new HashSet<>(
			readPath(spark, datasourceMapPath, DatasourceMaster.class)
				.map((MapFunction<DatasourceMaster, String>) dm -> dm.getMaster(), Encoders.STRING())
				.collectAsList());

		readPath(spark, inputPath, resultClazz)
			.map(patchResult(), Encoders.bean(resultClazz))
			.filter(Objects::nonNull)
			.map(
				(MapFunction<R, R>) value -> enrich(value, hostedByIds),
				Encoders.bean(resultClazz))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(workingPath);

		readPath(spark, workingPath, resultClazz)
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(inputPath);
	}

	/**
	 * Adds the {@code eosc} context to {@code value} when it is not there yet and at least one of its instances
	 * qualifies. An instance qualifies when the key of its {@code hostedby} is one of {@code hostedByIds}, or
	 * when the result declares at least one EOSC IF guideline.
	 *
	 * <p>A missing instance list, a null instance or an instance without {@code hostedby} never qualifies on the
	 * hosted-by criterion, instead of failing with a {@link NullPointerException}.
	 *
	 * @param value the result to tag; its context list must not be null
	 * @param hostedByIds the identifiers of the datasources that trigger the tagging
	 * @return {@code value} itself, possibly with one more context
	 */
	private static <R extends Result> R enrich(R value, Collection<String> hostedByIds) {
		// does not depend on the instance, so it is evaluated once instead of once per instance
		final boolean hasGuidelines = value.getEoscifguidelines() != null
			&& !value.getEoscifguidelines().isEmpty();

		final boolean eligible = Optional
			.ofNullable(value.getInstance())
			.map(
				instances -> instances
					.stream()
					.filter(Objects::nonNull)
					.anyMatch(
						i -> hasGuidelines
							|| (i.getHostedby() != null && hostedByIds.contains(i.getHostedby().getKey()))))
			.orElse(false);

		if (eligible && value.getContext().stream().noneMatch(c -> EOSC_CONTEXT_ID.equals(c.getId()))) {
			final Context context = new Context();
			context.setId(EOSC_CONTEXT_ID);
			context
				.setDataInfo(
					Arrays
						.asList(
							OafMapperUtils
								.dataInfo(
									false, BULKTAG_DATA_INFO_TYPE, true, false,
									OafMapperUtils
										.qualifier(
											CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE,
											DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS),
									TAGGING_TRUST)));
			value.getContext().add(context);
		}
		return value;
	}

	/**
	 * Reads a text file holding one JSON document per line and maps every line to an instance of {@code clazz}.
	 *
	 * @param spark the active session
	 * @param inputPath the location to read
	 * @param clazz the bean type every line is deserialised into
	 * @return the typed dataset
	 */
	public static <R> Dataset<R> readPath(
		SparkSession spark, String inputPath, Class<R> clazz) {
		return spark
			.read()
			.textFile(inputPath)
			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
	}

	// TODO remove this hack as soon as the values fixed by this method will be provided as NON null
	/**
	 * Returns a mapper that fills in the values later steps rely on: {@code deletedbyinference} defaults to
	 * {@code false} and a missing context list becomes an empty list. A result without data info is left
	 * untouched on that side rather than failing.
	 */
	private static <R extends Result> MapFunction<R, R> patchResult() {
		return r -> {
			if (r.getDataInfo() != null && r.getDataInfo().getDeletedbyinference() == null) {
				r.getDataInfo().setDeletedbyinference(false);
			}
			if (r.getContext() == null) {
				r.setContext(new ArrayList<>());
			}
			return r;
		};
	}
}

View File

@ -1,13 +1,10 @@
package eu.dnetlib.dhp.bulktag; package eu.dnetlib.dhp.bulktag.eosc;
import static eu.dnetlib.dhp.PropagationConstant.readPath; import static eu.dnetlib.dhp.PropagationConstant.readPath;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.*; import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -21,35 +18,15 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class SparkEoscTag { public class SparkEoscTag {
private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static final Qualifier EOSC_QUALIFIER = OafMapperUtils public static final String EOSC_GALAXY_WORKFLOW = "EOSC::Galaxy Workflow";
.qualifier( public static final String EOSC_TWITTER_DATA = "EOSC::Twitter Data";
"EOSC", public static final String EOSC_JUPYTER_NOTEBOOK = "EOSC::Jupyter Notebook";
"European Open Science Cloud", public static final String COMPLIES_WITH = "compliesWith";
ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES);
public static final DataInfo EOSC_DATAINFO = OafMapperUtils
.dataInfo(
false, "propagation", true, false,
OafMapperUtils
.qualifier(
"propagation:subject", "Inferred by OpenAIRE",
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.9");
public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils
.structuredProperty(
"EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO);
public final static StructuredProperty EOSC_GALAXY = OafMapperUtils
.structuredProperty(
"EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO);
public final static StructuredProperty EOSC_TWITTER = OafMapperUtils
.structuredProperty(
"EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO);
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils String jsonConfiguration = IOUtils
@ -84,29 +61,35 @@ public class SparkEoscTag {
}); });
} }
/**
 * Creates an {@link EoscIfGuidelines} populated with the given values.
 *
 * @param code the guideline code
 * @param label the guideline label
 * @param url the guideline url
 * @param semantics the value stored as semantic relation
 * @return the new instance
 */
public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics) {
	final EoscIfGuidelines guideline = new EoscIfGuidelines();
	guideline.setCode(code);
	guideline.setLabel(label);
	guideline.setUrl(url);
	guideline.setSemanticRelation(semantics);
	return guideline;
}
private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) { private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) {
readPath(spark, inputPath + "/software", Software.class) readPath(spark, inputPath + "/software", Software.class)
.map((MapFunction<Software, Software>) s -> { .map((MapFunction<Software, Software>) s -> {
List<StructuredProperty> sbject;
if (!Optional.ofNullable(s.getSubject()).isPresent())
s.setSubject(new ArrayList<>());
sbject = s.getSubject();
if (containsCriteriaNotebook(s)) { if (containsCriteriaNotebook(s)) {
sbject.add(EOSC_NOTEBOOK); if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent())
if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))) { s.setEoscifguidelines(new ArrayList<>());
sbject = sbject.stream().map(sb -> { addEIG(
if (sb.getValue().equals("EOSC Jupyter Notebook")) { s.getEoscifguidelines(), EOSC_JUPYTER_NOTEBOOK, EOSC_JUPYTER_NOTEBOOK, "",
return null; COMPLIES_WITH);
}
return sb;
}).filter(Objects::nonNull).collect(Collectors.toList());
s.setSubject(sbject);
}
} }
if (containsCriteriaGalaxy(s)) { if (containsCriteriaGalaxy(s)) {
sbject.add(EOSC_GALAXY); if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent())
s.setEoscifguidelines(new ArrayList<>());
addEIG(
s.getEoscifguidelines(), EOSC_GALAXY_WORKFLOW, EOSC_GALAXY_WORKFLOW, "", COMPLIES_WITH);
} }
return s; return s;
}, Encoders.bean(Software.class)) }, Encoders.bean(Software.class))
@ -123,15 +106,17 @@ public class SparkEoscTag {
readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class) readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class)
.map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp -> { .map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp -> {
List<StructuredProperty> sbject;
if (!Optional.ofNullable(orp.getSubject()).isPresent()) if (!Optional.ofNullable(orp.getEoscifguidelines()).isPresent())
orp.setSubject(new ArrayList<>()); orp.setEoscifguidelines(new ArrayList<>());
sbject = orp.getSubject();
if (containsCriteriaGalaxy(orp)) { if (containsCriteriaGalaxy(orp)) {
sbject.add(EOSC_GALAXY); addEIG(
orp.getEoscifguidelines(), EOSC_GALAXY_WORKFLOW, EOSC_GALAXY_WORKFLOW, "",
COMPLIES_WITH);
} }
if (containscriteriaTwitter(orp)) { if (containscriteriaTwitter(orp)) {
sbject.add(EOSC_TWITTER); addEIG(orp.getEoscifguidelines(), EOSC_TWITTER_DATA, EOSC_TWITTER_DATA, "", COMPLIES_WITH);
} }
return orp; return orp;
}, Encoders.bean(OtherResearchProduct.class)) }, Encoders.bean(OtherResearchProduct.class))
@ -148,12 +133,11 @@ public class SparkEoscTag {
readPath(spark, inputPath + "/dataset", Dataset.class) readPath(spark, inputPath + "/dataset", Dataset.class)
.map((MapFunction<Dataset, Dataset>) d -> { .map((MapFunction<Dataset, Dataset>) d -> {
List<StructuredProperty> sbject;
if (!Optional.ofNullable(d.getSubject()).isPresent()) if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent())
d.setSubject(new ArrayList<>()); d.setEoscifguidelines(new ArrayList<>());
sbject = d.getSubject();
if (containscriteriaTwitter(d)) { if (containscriteriaTwitter(d)) {
sbject.add(EOSC_TWITTER); addEIG(d.getEoscifguidelines(), EOSC_TWITTER_DATA, EOSC_TWITTER_DATA, "", COMPLIES_WITH);
} }
return d; return d;
}, Encoders.bean(Dataset.class)) }, Encoders.bean(Dataset.class))
@ -169,6 +153,12 @@ public class SparkEoscTag {
.json(inputPath + "/dataset"); .json(inputPath + "/dataset");
} }
/**
 * Appends a new guideline built from the given values, unless the list already holds an entry with the
 * same code.
 *
 * @param eoscifguidelines the list to extend; must not be null
 * @param code the guideline code, also used for the duplicate check
 * @param label the guideline label
 * @param url the guideline url
 * @param sem the value stored as semantic relation
 */
private static void addEIG(List<EoscIfGuidelines> eoscifguidelines, String code, String label, String url,
	String sem) {
	for (EoscIfGuidelines existing : eoscifguidelines) {
		if (existing.getCode().equals(code)) {
			// already present: nothing to add
			return;
		}
	}
	eoscifguidelines.add(newInstance(code, label, url, sem));
}
private static boolean containscriteriaTwitter(Result r) { private static boolean containscriteriaTwitter(Result r) {
Set<String> words = getWordsSP(r.getTitle()); Set<String> words = getWordsSP(r.getTitle());
words.addAll(getWordsF(r.getDescription())); words.addAll(getWordsF(r.getDescription()));
@ -177,10 +167,12 @@ public class SparkEoscTag {
(words.contains("data") || words.contains("dataset"))) (words.contains("data") || words.contains("dataset")))
return true; return true;
if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) && return Optional
r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data"))) .ofNullable(r.getSubject())
return true; .map(
return false; s -> s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data")))
.orElse(false);
} }
private static boolean containsCriteriaGalaxy(Result r) { private static boolean containsCriteriaGalaxy(Result r) {
@ -190,13 +182,17 @@ public class SparkEoscTag {
words.contains("workflow")) words.contains("workflow"))
return true; return true;
if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) && return Optional
r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow"))) .ofNullable(r.getSubject())
return true; .map(
return false; s -> s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow")))
.orElse(false);
} }
private static boolean containsCriteriaNotebook(Software s) { private static boolean containsCriteriaNotebook(Software s) {
if (!Optional.ofNullable(s.getSubject()).isPresent())
return false;
if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter"))) if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
return true; return true;
if (s if (s
@ -212,13 +208,6 @@ public class SparkEoscTag {
return false; return false;
} }
private static Set<String> getSubjects(List<StructuredProperty> s) {
Set<String> subjects = new HashSet<>();
s.stream().forEach(sbj -> subjects.addAll(Arrays.asList(sbj.getValue().toLowerCase().split(" "))));
s.stream().forEach(sbj -> subjects.add(sbj.getValue().toLowerCase()));
return subjects;
}
private static Set<String> getWordsSP(List<StructuredProperty> elem) { private static Set<String> getWordsSP(List<StructuredProperty> elem) {
Set<String> words = new HashSet<>(); Set<String> words = new HashSet<>();
Optional Optional
@ -242,10 +231,7 @@ public class SparkEoscTag {
t -> words t -> words
.addAll( .addAll(
Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))))); Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" ")))));
// elem
// .forEach(
// t -> words.addAll(Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))));
return words;
return words;
} }
} }

View File

@ -0,0 +1,32 @@
[
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the path where the sequential file is stored",
"paramRequired": true
},
{
"paramName": "nn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the name node on hdfs",
"paramRequired": true
},
{
"paramName": "pgurl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "pguser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "pgpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
}
]

View File

@ -0,0 +1,34 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequential file to read",
"paramRequired": true
},
{
"paramName": "dmp",
"paramLongName":"datasourceMapPath",
"paramDescription": "the path where the association datasource master has been stored",
"paramRequired": true
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]

View File

@ -16,6 +16,21 @@
<name>outputPath</name> <name>outputPath</name>
<description>the output path</description> <description>the output path</description>
</property> </property>
<property>
<name>postgresURL</name>
<description>the url of the postgres server to query</description>
</property>
<property>
<name>postgresUser</name>
<description>the username to access the postgres db</description>
</property>
<property>
<name>postgresPassword</name>
<description>the postgres password</description>
</property>
</parameters> </parameters>
<global> <global>
@ -211,7 +226,7 @@
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>EOSC_tagging</name> <name>EOSC_tagging</name>
<class>eu.dnetlib.dhp.bulktag.SparkEoscTag</class> <class>eu.dnetlib.dhp.bulktag.eosc.SparkEoscTag</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar> <jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--num-executors=${sparkExecutorNumber} --num-executors=${sparkExecutorNumber}
@ -226,10 +241,132 @@
<arg>--sourcePath</arg><arg>${outputPath}</arg> <arg>--sourcePath</arg><arg>${outputPath}</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscTag</arg> <arg>--workingPath</arg><arg>${workingDir}/eoscTag</arg>
</spark> </spark>
<ok to="End"/> <ok to="eosc_get_datasource_master"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="eosc_get_datasource_master">
<java>
<main-class>eu.dnetlib.dhp.bulktag.eosc.ReadMasterDatasourceFromDB</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/datasourcemaster</arg>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
</java>
<ok to="fork_eosc_context_tag"/>
<error to="Kill"/>
</action>
<fork name="fork_eosc_context_tag">
<path start="eosc_context_tag_publication"/>
<path start="eosc_context_tag_dataset"/>
<path start="eosc_context_tag_otherresearchproduct"/>
<path start="eosc_context_tag_software"/>
</fork>
<action name="eosc_context_tag_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>EOSC_tagging</name>
<class>eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=${sparkExecutorNumber}
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/publication</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
<ok to="wait_eosc_context_tag"/>
<error to="Kill"/>
</action>
<action name="eosc_context_tag_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>EOSC_tagging</name>
<class>eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=${sparkExecutorNumber}
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/dataset</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
<ok to="wait_eosc_context_tag"/>
<error to="Kill"/>
</action>
<action name="eosc_context_tag_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>EOSC_tagging</name>
<class>eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=${sparkExecutorNumber}
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/software</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
<ok to="wait_eosc_context_tag"/>
<error to="Kill"/>
</action>
<action name="eosc_context_tag_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>EOSC_tagging</name>
<class>eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--num-executors=${sparkExecutorNumber}
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
<ok to="wait_eosc_context_tag"/>
<error to="Kill"/>
</action>
<join name="wait_eosc_context_tag" to="End"/>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -0,0 +1,162 @@
package eu.dnetlib.dhp.bulktag;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author miriam.baglioni
* @Date 22/07/22
*/
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
//"50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea" has instance hostedby eosc
//"50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1" has instance hostedby eosc
//"50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7" has two instances, one hostedby eosc
//"50|475c1990cbb2::3894c94123e96df8a21249957cf160cb" has EoscTag
public class EOSCContextTaggingTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(EOSCContextTaggingTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(EOSCContextTaggingTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(EOSCContextTaggingTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(EOSCTagJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void EoscContextTagTest() throws Exception {
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/dataset/dataset_10.json").getPath())
.map(
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
Encoders.bean(Dataset.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/dataset");
SparkEoscBulkTag
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
workingDir.toString() + "/input/dataset",
"-workingPath", workingDir.toString() + "/working/dataset",
"-datasourceMapPath",
getClass()
.getResource("/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster")
.getPath(),
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(10, tmp.count());
Assertions
.assertEquals(
4,
tmp
.filter(
s -> s.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
.count());
Assertions
.assertEquals(
1,
tmp
.filter(
d -> d.getId().equals("50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea")
&&
d.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
.count());
Assertions
.assertEquals(
1,
tmp
.filter(
d -> d.getId().equals("50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1")
&&
d.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
.count());
Assertions
.assertEquals(
1,
tmp
.filter(
d -> d.getId().equals("50|475c1990cbb2::3894c94123e96df8a21249957cf160cb")
&&
d.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
.count());
Assertions
.assertEquals(
1,
tmp
.filter(
d -> d.getId().equals("50|475c1990cbb2::3894c94123e96df8a21249957cf160cb")
&&
d.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
.count());
}
}

View File

@ -1,21 +1,17 @@
package eu.dnetlib.dhp.bulktag; package eu.dnetlib.dhp.bulktag;
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.List; import java.util.List;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterAll;
@ -27,6 +23,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.bulktag.eosc.SparkEoscTag;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
public class EOSCTagJobTest { public class EOSCTagJobTest {
@ -126,10 +123,23 @@ public class EOSCTagJobTest {
.assertEquals( .assertEquals(
4, 4,
tmp tmp
.filter(s -> s.getEoscifguidelines() != null)
.filter( .filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook")))
.count()); .count());
Assertions
.assertEquals(
1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
Assertions Assertions
.assertEquals( .assertEquals(
1, tmp 1, tmp
@ -140,6 +150,16 @@ public class EOSCTagJobTest {
.size()); .size());
Assertions Assertions
.assertTrue( .assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
Assertions
.assertFalse(
tmp tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect() .collect()
@ -166,16 +186,24 @@ public class EOSCTagJobTest {
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getEoscifguidelines() == null);
Assertions Assertions
.assertEquals( .assertEquals(
9, tmp 8, tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect() .collect()
.get(0) .get(0)
.getSubject() .getSubject()
.size()); .size());
Assertions Assertions
.assertTrue( .assertFalse(
tmp tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect() .collect()
@ -183,6 +211,23 @@ public class EOSCTagJobTest {
.getSubject() .getSubject()
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
Assertions Assertions
.assertEquals( .assertEquals(
@ -201,17 +246,24 @@ public class EOSCTagJobTest {
.getSubject() .getSubject()
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
.collect()
.get(0)
.getEoscifguidelines() == null);
Assertions Assertions
.assertEquals( .assertEquals(
9, tmp 8, tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect() .collect()
.get(0) .get(0)
.getSubject() .getSubject()
.size()); .size());
Assertions Assertions
.assertTrue( .assertFalse(
tmp tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect() .collect()
@ -219,14 +271,31 @@ public class EOSCTagJobTest {
.getSubject() .getSubject()
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
1,
tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
List<StructuredProperty> subjects = tmp List<StructuredProperty> subjects = tmp
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
.collect() .collect()
.get(0) .get(0)
.getSubject(); .getSubject();
Assertions.assertEquals(8, subjects.size()); Assertions.assertEquals(7, subjects.size());
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("Modeling and Simulation"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("Modeling and Simulation")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("structure granulaire"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("structure granulaire")));
@ -250,6 +319,17 @@ public class EOSCTagJobTest {
.filter( .filter(
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count()); .count());
Assertions
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
.filter(
ds -> ds
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook")))
.count());
Assertions Assertions
.assertEquals( .assertEquals(
@ -264,7 +344,22 @@ public class EOSCTagJobTest {
.textFile(workingDir.toString() + "/input/otherresearchproduct") .textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.filter( .filter(
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) orp -> orp
.getSubject()
.stream()
.anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count());
Assertions
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.filter(
orp -> orp
.getSubject()
.stream()
.anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook")))
.count()); .count());
// spark.stop(); // spark.stop();
@ -326,22 +421,41 @@ public class EOSCTagJobTest {
Assertions Assertions
.assertEquals( .assertEquals(
1, 0,
tmp tmp
.filter( .filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
.count()); .count());
Assertions
.assertEquals(
1,
tmp
.filter(
s -> s.getEoscifguidelines() != null)
.count());
Assertions
.assertEquals(
1,
tmp
.filter(
s -> s.getEoscifguidelines() != null)
.filter(
s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")))
.count());
Assertions Assertions
.assertEquals( .assertEquals(
2, tmp 1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect() .collect()
.get(0) .get(0)
.getSubject() .getSubject()
.size()); .size());
Assertions Assertions
.assertTrue( .assertFalse(
tmp tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect() .collect()
@ -350,6 +464,24 @@ public class EOSCTagJobTest {
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")));
Assertions Assertions
.assertEquals( .assertEquals(
5, tmp 5, tmp
@ -385,22 +517,34 @@ public class EOSCTagJobTest {
Assertions Assertions
.assertEquals( .assertEquals(
1, 0,
orp orp
.filter( .filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
.count()); .count());
orp.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o)));
Assertions Assertions
.assertEquals( .assertEquals(
3, orp 1, orp
.filter(o -> o.getEoscifguidelines() != null)
.filter(
o -> o
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")))
.count());
Assertions
.assertEquals(
2, orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect() .collect()
.get(0) .get(0)
.getSubject() .getSubject()
.size()); .size());
Assertions Assertions
.assertTrue( .assertFalse(
orp orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect() .collect()
@ -408,6 +552,23 @@ public class EOSCTagJobTest {
.getSubject() .getSubject()
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
1, orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
Assertions
.assertTrue(
orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow")));
Assertions Assertions
.assertEquals( .assertEquals(
@ -516,10 +677,20 @@ public class EOSCTagJobTest {
Assertions Assertions
.assertEquals( .assertEquals(
3, 0,
orp orp
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
.count()); .count());
Assertions
.assertEquals(
3,
orp
.filter(
s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data")))
.count());
JavaRDD<Dataset> dats = sc JavaRDD<Dataset> dats = sc
.textFile(workingDir.toString() + "/input/dataset") .textFile(workingDir.toString() + "/input/dataset")
@ -531,7 +702,11 @@ public class EOSCTagJobTest {
.assertEquals( .assertEquals(
3, 3,
dats dats
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) .filter(
s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data")))
.count()); .count());
} }

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,318 @@
{"datasource":"eosc________::100percentit::100percentit.100_percent_it_trusted_cloud","master":"10|eosc________::7ef2576047f040612b983a27347471fc"}
{"datasource":"eosc________::altec::altec.space-vis_adn_service","master":"10|eosc________::2946c48bbcc514ad76bbbf727d5d8fbc"}
{"datasource":"eosc________::astron::astron.","master":"10|eosc________::acb262d4bfdeb6aa9b463a4a6d0d662a"}
{"datasource":"eosc________::athena::athena.atmo-flud","master":"10|eosc________::ac448975e1d7f8b0266c8bb3b3992029"}
{"datasource":"eosc________::athena::athena.uw-map","master":"10|eosc________::5f2a401cf8ce9dc22a3776cea519b594"}
{"datasource":"eosc________::athena::athena.verbal_aggression_analyser_va_analyser","master":"10|eosc________::8b26233e89a50e3754972b1341130494"}
{"datasource":"eosc________::authenix::authenix.authenix","master":"10|eosc________::3cd84764da5728473593a580efb29a40"}
{"datasource":"eosc________::bineo::bineo.cos4bio","master":"10|eosc________::903e0526a6e56eeaf0e4561aa862ecb8"}
{"datasource":"eosc________::blue-cloud::blue-cloud.phytoplankton_eovs","master":"10|eosc________::c2438d79b48baf817956f3856877b3b8"}
{"datasource":"eosc________::bsc-es::bsc-es.bdrc_-_barcelona_dust_regional_center","master":"10|eosc________::756664ca614118315840eb8e985e4377"}
{"datasource":"eosc________::bsc-es::bsc-es.openebench","master":"10|eosc________::69ed72b873b803feed5ba6ae47548419"}
{"datasource":"eosc________::capsh::capsh.dissemin","master":"10|eosc________::e81587742e4107ce83723df17c27cb40"}
{"datasource":"eosc________::carlzeissm::carlzeissm.aper","master":"10|eosc________::f3beb9ee5ee293b723e2edd6f990fde3"}
{"datasource":"eosc________::ccsd::ccsd.episciences","master":"10|eosc________::e1e9de0dbf4bce79c49338d7cf9327e2"}
{"datasource":"eosc________::cds::cds.simbad_simbad_astronomical_database_provides_basic_data_cross-identifications_bibliography_and_measurements_for_astronomical_objects_outside_the_solar_system","master":"10|eosc________::a1e41e71453ac32161f4ac3f5c0e0421"}
{"datasource":"eosc________::centerdata::centerdata.surveycodingsorg","master":"10|eosc________::72db73ab253727c889905da50f506d10"}
{"datasource":"eosc________::cesga::cesga.finisterrae","master":"10|eosc________::6af4303d93f72744cc4c3c815ed2c9a0"}
{"datasource":"eosc________::cesnet::cesnet.metacentrum_cloud","master":"10|eosc________::cebfaa2d0b93502d56a8fbeb6b66cfbe"}
{"datasource":"eosc________::cesnet::cesnet.object_based_storage","master":"10|eosc________::1c5b55339bb86ff997a256d42d7be4b0"}
{"datasource":"eosc________::cesnet::cesnet.umsa_-_untargeted_mass_spectrometry_data_analysis","master":"10|eosc________::d928868211759352cb1604713e0347ec"}
{"datasource":"eosc________::cessda-eric::cessda-eric.cessda_data_catalogue","master":"10|fairsharing_::936824c0191953647ec609b4f49bc964"}
{"datasource":"eosc________::cessda-eric::cessda-eric.data_management_expert_guide_dmeg","master":"10|eosc________::22c14aaf31fc64424fa97adffe6380b9"}
{"datasource":"eosc________::cessda-eric::cessda-eric.elsst__european_language_social_science_thesaurus","master":"10|eosc________::5b30e057381cf0200dc2cdc7b562f570"}
{"datasource":"eosc________::cines::cines.etdr","master":"10|eosc________::3b7f7d6aafb0154025330183d59ce670"}
{"datasource":"eosc________::clarin-eric::clarin-eric.language_resource_switchboard","master":"10|eosc________::3531aa80dbe2b1018133b510a933de40"}
{"datasource":"eosc________::clarin-eric::clarin-eric.virtual_collection_registry","master":"10|eosc________::454e4f7f9f53d9dacf9dc3ba27902f16"}
{"datasource":"eosc________::clarin-eric::clarin-eric.virtual_language_observatory","master":"10|eosc________::4db0c877190783461728c6714cb66cbc"}
{"datasource":"eosc________::cloudferro::cloudferro.data_collections_catalog","master":"10|eosc________::eba1540eb9e87231fdf366eb23d16c3a"}
{"datasource":"eosc________::cloudferro::cloudferro.data_related_services_-_eo_browser","master":"10|eosc________::c24ebda20485c08293b72561ee3c634b"}
{"datasource":"eosc________::cloudferro::cloudferro.data_related_services_-_eo_finder","master":"10|eosc________::3d68186239b6c0f0d677ff55d9b549d1"}
{"datasource":"eosc________::cloudferro::cloudferro.infrastructure","master":"10|eosc________::ac7e3c0151fa3f11d3a7739dddaa3416"}
{"datasource":"eosc________::cmcc::cmcc.enes_data_space","master":"10|eosc________::2925e4df4147819e5b5d2f886f40e3a2"}
{"datasource":"eosc________::cnb-csic::cnb-csic.3dbionotes-ws_web_application_to_annotate_biochemical_and_biomedical_information_onto_structural_models","master":"10|eosc________::77fe0a66415f2440ab60d47dcee678a5"}
{"datasource":"eosc________::cnb-csic::cnb-csic.scipioncloud","master":"10|eosc________::7f09b7fee99363813f24aca9ebdecf61"}
{"datasource":"eosc________::cnr-iia::cnr-iia.geo_dab","master":"10|eosc________::108b0148352c15ee1ce935699e09add3"}
{"datasource":"eosc________::collabwith::collabwith.collabwith_marketplace","master":"10|eosc________::894a0ffa7768b228c1b46793670c85e6"}
{"datasource":"eosc________::coronis_computing_sl::coronis_computing_sl.uw-mos","master":"10|eosc________::9cbf0a75d817e291771b8bce6440f5f4"}
{"datasource":"eosc________::coronis_computing_sl::coronis_computing_sl.vd-maps","master":"10|eosc________::b5af1514b39d8e021554a73076a694d9"}
{"datasource":"eosc________::creaf::creaf.nimmbus_geospatial_user_feedback","master":"10|eosc________::86c325db16448760b3390dda7e46631a"}
{"datasource":"eosc________::creatis::creatis.virtual_imaging_platform","master":"10|eosc________::01a45ac2677f89414af91e651735846d"}
{"datasource":"eosc________::cs_group::cs_group.ai4geo_engine","master":"10|eosc________::c61211295d27e5e08f4c64f3e3098294"}
{"datasource":"eosc________::csc-fi::csc-fi.chipster","master":"10|eosc________::61549f785a2c93939be011b0453a6981"}
{"datasource":"eosc________::csc-fi::csc-fi.cpouta","master":"10|eosc________::d71c843b4e00eff17db07bf9d10769f9"}
{"datasource":"eosc________::csc-fi::csc-fi.csc_epouta","master":"10|eosc________::4493bd6a93e5b8465fda8cf7ab2dfdea"}
{"datasource":"eosc________::csc-fi::csc-fi.rahti_container_cloud","master":"10|eosc________::cc60eb9fc76f9598ee581eff0792573b"}
{"datasource":"eosc________::cscs::cscs.object_storage","master":"10|eosc________::3da6a817fe85ef43f7d97ef07e467d45"}
{"datasource":"eosc________::csi_piemonte::csi_piemonte.nivola2","master":"10|eosc________::ac6483be3e556c8652b8595680795983"}
{"datasource":"eosc________::csic::csic.csic_cloud_infrastructure","master":"10|eosc________::05ea2eb193382e22f32b32fbe9a4d961"}
{"datasource":"eosc________::cyberbotics::cyberbotics.robotbenchmark","master":"10|eosc________::27ee094c68b7a758ca2915aca6215a1d"}
{"datasource":"eosc________::d4science::d4science.alien_and_invasive_species_vre","master":"10|eosc________::b5cff6d55dcf6c20e78a0f1f847b3005"}
{"datasource":"eosc________::d4science::d4science.rprototypinglab_virtual_research_environment","master":"10|eosc________::8073ab0dbb22dc3b9f17627a7b25903f"}
{"datasource":"eosc________::d4science::d4science.visual_media_service_vre","master":"10|eosc________::eabf459f53c2bfe6247f006fcc0f4db7"}
{"datasource":"eosc________::dariah_eric::dariah_eric.dariah-campus","master":"10|eosc________::9c63075d6642a2d269776c2b90c2f976"}
{"datasource":"eosc________::dariah_eric::dariah_eric.ssh_open_marketplace","master":"10|eosc________::91fe494a3c21805febb03353152f1212"}
{"datasource":"eosc________::datacite::datacite.datacite_doi_registration_service","master":"10|eosc________::c146a470f01ee7ded3b55acda9362e7f"}
{"datasource":"eosc________::dcc-uk::dcc-uk.dmponline","master":"10|eosc________::fe480090e0739dab86b24a11177eeffd"}
{"datasource":"eosc________::denbi::denbi.cloud","master":"10|eosc________::59399e560967488c0ae0329e0d37f5b4"}
{"datasource":"eosc________::desy::desy.pan_data","master":"10|eosc________::52008fe404bf2e939140109162f9233f"}
{"datasource":"eosc________::desy::desy.pan_faas","master":"10|eosc________::026939c4b12d7d71e2b05bc5acde804e"}
{"datasource":"eosc________::desy::desy.pan_gitlab","master":"10|eosc________::f13cefc9f3207cb82f3285b05f190f78"}
{"datasource":"eosc________::desy::desy.pan_notebook","master":"10|eosc________::500fe61cce6562797cd43797aab12be5"}
{"datasource":"eosc________::digitalglobe::digitalglobe.earthwatch","master":"10|eosc________::020d905260267066c1926f526bb86f30"}
{"datasource":"eosc________::dkrz::dkrz.enes_climate_analytics_service","master":"10|eosc________::1d7a1fea6694d15d9e67f08e1e77082b"}
{"datasource":"eosc________::doabf::doabf.operas_certification","master":"10|eosc________::79b9748edeffb872a28660a9d238dcec"}
{"datasource":"eosc________::ds-wizard::ds-wizard.data_stewardship_wizard","master":"10|eosc________::fc6bad963e15e218efc62c7befd122af"}
{"datasource":"eosc________::egi-fed::egi-fed.check-in","master":"10|eosc________::baa3c497b9499b3d8c87ea8d2b37a44f"}
{"datasource":"eosc________::egi-fed::egi-fed.cloud_compute","master":"10|eosc________::b1179384a336d409fc909fe3711d3d1f"}
{"datasource":"eosc________::egi-fed::egi-fed.cloud_container_compute","master":"10|eosc________::a66bb1ac56a3bcf2c24b0ef85ed2bdfc"}
{"datasource":"eosc________::egi-fed::egi-fed.data_transfer","master":"10|eosc________::6c0bf38e885c42161b88093517f6cd3e"}
{"datasource":"eosc________::egi-fed::egi-fed.egi_datahub","master":"10|eosc________::5a260dae80795584ac08df133adb1fad"}
{"datasource":"eosc________::egi-fed::egi-fed.fitsm_training","master":"10|eosc________::927b4455c0a21692d2a9f634bccd8309"}
{"datasource":"eosc________::egi-fed::egi-fed.high-throughput_compute","master":"10|eosc________::e27ec11ac7b7d6ffbbce668b7d1f81d5"}
{"datasource":"eosc________::egi-fed::egi-fed.iso_27001_training","master":"10|eosc________::98a6655b6421166c5c29baa2f5815de3"}
{"datasource":"eosc________::egi-fed::egi-fed.notebook","master":"10|eosc________::1d37909a6a31147a09ee9f2e579a6706"}
{"datasource":"eosc________::egi-fed::egi-fed.online_storage","master":"10|eosc________::d8b94284582d3e2185a782ae2ba42186"}
{"datasource":"eosc________::egi-fed::egi-fed.training_infrastructure","master":"10|eosc________::38cdb8e44638f2e561c466f0dd26cf96"}
{"datasource":"eosc________::egi-fed::egi-fed.workload_manager","master":"10|eosc________::ff515071cd88afb40599edcb6637f47e"}
{"datasource":"eosc________::ehri::ehri.begrenzte_flucht","master":"10|eosc________::01d1445605fc1d25e6a7f21ba995d724"}
{"datasource":"eosc________::ehri::ehri.diplomatic_reports","master":"10|eosc________::11714353d2ed069ca30b177d4b4d9e0f"}
{"datasource":"eosc________::ehri::ehri.early_holocaust_testimony","master":"10|eosc________::0a4974b0bb295b98f88cb7c793f91c17"}
{"datasource":"eosc________::ehri::ehri.ehri_document_blog","master":"10|eosc________::fb9291f8dac099986eafe957b169ed97"}
{"datasource":"eosc________::ehri::ehri.international_research_portal_for_records_related_to_nazi-era_cultural_property","master":"10|eosc________::01c5b10e57f9cbb4f3125f427375487e"}
{"datasource":"eosc________::ehri::ehri.the_ehri_portal","master":"10|eosc________::6ad4d5352fd192b5fecd76bbd7a7e8b7"}
{"datasource":"eosc________::eiscat::eiscat.eiscat_data_access_portal","master":"10|eosc________::0f06a55c8333ae4d197c1d263b2be6ba"}
{"datasource":"eosc________::elixir-italy::elixir-italy.laniakea_recas","master":"10|eosc________::01e84abe377339ea57ed521ac39130e9"}
{"datasource":"eosc________::elixir-uk::elixir-uk.cyverse_uk","master":"10|eosc________::6a6a05847befec6587bef7673112f5e5"}
{"datasource":"eosc________::elixir-uk::elixir-uk.workflowhub","master":"10|fairsharing_::c8cd63e1bf13c5016881652983fb615a"}
{"datasource":"eosc________::elsevier::elsevier.digital_commons","master":"10|eosc________::67d38b6a1f43184676b113369554676b"}
{"datasource":"eosc________::embl-ebi::embl-ebi.embassy_cloud","master":"10|eosc________::7f8b24797312b851916ee1be0f836de6"}
{"datasource":"eosc________::embl-ebi::embl-ebi.identifiersorg","master":"10|eosc________::564e9f467aad251143e12e2e6ec19768"}
{"datasource":"eosc________::embl-ebi::embl-ebi.identifiersorg_central_registry","master":"10|eosc________::441caf7eaa4a6602aceae36b2697b924"}
{"datasource":"eosc________::embl-ebi::embl-ebi.identifiersorg_resolution_services","master":"10|eosc________::8df6273a1cb2289dbbe3a4b5fe05aa53"}
{"datasource":"eosc________::emso_eric::emso_eric.emso_eric_data_portal","master":"10|eosc________::94a41630bd9ddea4a88ec0bfba1b9d95"}
{"datasource":"eosc________::enermaps::enermaps.enermaps_data_management_tool","master":"10|eosc________::11496ee8a69b4b955200da7f2c12fe3b"}
{"datasource":"eosc________::enhancer::enhancer.openrdmeu","master":"10|eosc________::04820bece2545235144903dec056bcbd"}
{"datasource":"eosc________::enhancer::enhancer.swiss_escience_grid_certificates","master":"10|eosc________::4968516eb3b1ad6d883e74a84827e963"}
{"datasource":"eosc________::eodc::eodc.data_catalogue_service","master":"10|eosc________::21c44a2b6946e02300dbe36a8edec650"}
{"datasource":"eosc________::eodc::eodc.jupyterhub_for_global_copernicus_data","master":"10|eosc________::f99ccd68bf3de6a0a3b0db3441a41bbd"}
{"datasource":"eosc________::eosc-dih::eosc-dih.piloting_and_co-design_of_the_business_pilots","master":"10|eosc________::178f3e4832afe9e477d761d2f3d95f85"}
{"datasource":"eosc________::eox::eox.edc_eoxhub_workspace","master":"10|eosc________::d71468878e069cf484fc988d276c6d9a"}
{"datasource":"eosc________::esa-int::esa-int.geoss_web_portal","master":"10|eosc________::d7bac1ce234c20e3ab43a74eefa34782"}
{"datasource":"eosc________::esrf::esrf.the_european_synchrotron_radiation_facility_data_portal","master":"10|fairsharing_::2996962656838a97af4c5f926fe6f1b0"}
{"datasource":"eosc________::ess::ess.pan-learning-org","master":"10|eosc________::1298286d3a7cc48fa525b118218c7836"}
{"datasource":"eosc________::ess_eric::ess_eric.european_social_survey_ess_as_a_service","master":"10|eosc________::faa60b95b602690861be9305812a5c07"}
{"datasource":"eosc________::eudat::eudat.b2access","master":"10|eosc________::4dee0695b946b545dc8d52c56598fbbf"}
{"datasource":"eosc________::eudat::eudat.b2drop","master":"10|eosc________::4c6a514f1392ac1d159214e61785849a"}
{"datasource":"eosc________::eudat::eudat.b2find","master":"10|eosc________::6069f46dfcc89ccf8043581c9034558e"}
{"datasource":"eosc________::eudat::eudat.b2handle","master":"10|eosc________::a23be7f6265fd1ad957eed16b5c8bdc4"}
{"datasource":"eosc________::eudat::eudat.b2note","master":"10|eosc________::dfd1d6816b4182e25e84f6cf10d108ed"}
{"datasource":"eosc________::eudat::eudat.b2safe","master":"10|re3data_____::a632666349a0bb9a36096c9e152d34cc"}
{"datasource":"eosc________::eudat::eudat.b2share","master":"10|eosc________::f959324bdb00f052d547b95da205062f"}
{"datasource":"eosc________::eurac::eurac.edp-portal_-_metadata_catalogue_of_eurac_research","master":"10|eosc________::274d73061a925a29d8743b3e1022d0dc"}
{"datasource":"eosc________::europeana::europeana.europeana_apis","master":"10|eosc________::91de8c90ebde3dc1c8d41f339fe3fac7"}
{"datasource":"eosc________::exoscale::exoscale.european_cloud_hosting","master":"10|eosc________::12b7e6fef784084b817a42f2990fe3f2"}
{"datasource":"eosc________::expertai::expertai.document_enrichment_api","master":"10|eosc________::6812b902471f12506c8e6441195aff57"}
{"datasource":"eosc________::expertai::expertai.recommender_api","master":"10|eosc________::c40634543c1217686f0a8f5e8592d100"}
{"datasource":"eosc________::expertai::expertai.search_api","master":"10|eosc________::79440bc8082949f56cbabef796cec7f1"}
{"datasource":"eosc________::fairdi::fairdi.nomad_repository","master":"10|eosc________::b9000c95a6fde9930ae74f4071e14cb2"}
{"datasource":"eosc________::figshare::figshare.figshare","master":"10|eosc________::5e6bd062c6b85e2d176b2e61636b8971"}
{"datasource":"eosc________::forschungsdaten::forschungsdaten.forschungsdateninfo","master":"10|eosc________::c9185fdb68af7d515e56054da546bc94"}
{"datasource":"eosc________::forth::forth.openbioeu","master":"10|eosc________::2db71171816e994877fb960b9fcd89f2"}
{"datasource":"eosc________::fssda::fssda.data_service_portal_aila","master":"10|eosc________::ef1f75ea6d244563bc6cfb0c3d3affa4"}
{"datasource":"eosc________::fssda::fssda.kuha2_metadata_server","master":"10|eosc________::b6af28d7c292dbbe816cd0d6a9a66f16"}
{"datasource":"eosc________::gbif-es::gbif-es.collections_registry","master":"10|eosc________::ac6da0cfbd07f8605c57a799c41dc947"}
{"datasource":"eosc________::gbif-es::gbif-es.e-Learning_platform","master":"10|eosc________::9059ca88ca8292881ffba9ad8d943d04"}
{"datasource":"eosc________::gbif-es::gbif-es.images_portal","master":"10|eosc________::6991e5dd230956156129669934798cd8"}
{"datasource":"eosc________::gbif-es::gbif-es.occurrence_records","master":"10|eosc________::948a9a53e2a9c94d32f99785eccff662"}
{"datasource":"eosc________::gbif-es::gbif-es.regions_module","master":"10|eosc________::11189c308854c8d8113161edc7fbd3de"}
{"datasource":"eosc________::gbif-es::gbif-es.spatial_portal","master":"10|eosc________::665f73f5e4b6a3693fec9426a6ce6ae8"}
{"datasource":"eosc________::gbif-es::gbif-es.species_portal","master":"10|eosc________::9fe2f2ccb3d17452bd6e7424f60340ce"}
{"datasource":"eosc________::gbif::gbif.gbif_species_occurrence_data","master":"10|fairsharing_::6e5025ccc7d638ae4e724da8938450a6"}
{"datasource":"eosc________::gbif_portugal::gbif_portugal.gbif_portugal_occurrence_records","master":"10|eosc________::fcd4f4efdecb4e675fdee043043f69fc"}
{"datasource":"eosc________::gcc_umcg::gcc_umcg.molgenis","master":"10|eosc________::7f255ebbb3715f258e8d7c470209e675"}
{"datasource":"eosc________::geant::geant.clouds_service_infrastructure_as_a_service","master":"10|eosc________::7debc69506a8019515d350707e8c82d7"}
{"datasource":"eosc________::geant::geant.edugain","master":"10|eosc________::3ded12106e7e870242f7ec39345b3b97"}
{"datasource":"eosc________::geant::geant.edumeet_-_webbased_videoconferencing_platform","master":"10|eosc________::dcf8b262f7f61d44eedf409a29d30abc"}
{"datasource":"eosc________::geant::geant.eduroam","master":"10|eosc________::e7fd04aab1f224aaa2b5d3478694748b"}
{"datasource":"eosc________::geant::geant.eduteams","master":"10|eosc________::f3b04fa1e741f17a842fcbea35e04318"}
{"datasource":"eosc________::geant::geant.eduvpn_-_access_your_institutes_network_or_the_internet_using_an_encrypted_connection","master":"10|eosc________::aeb7c573f2742ec5ef8b7332b6b614cb"}
{"datasource":"eosc________::geant::geant.inacademia","master":"10|eosc________::26cb3be539a5bbb25533d3b1bdb9d6aa"}
{"datasource":"eosc________::geant::geant.ip","master":"10|eosc________::59cd8dbce2703f4eea69a54a959aae89"}
{"datasource":"eosc________::geant::geant.l3vpn","master":"10|eosc________::1e70cff61071ce42baffa6dafaf3165e"}
{"datasource":"eosc________::geant::geant.lambda","master":"10|eosc________::20a8114b376bf4c455c034b7b4513805"}
{"datasource":"eosc________::geant::geant.mdvpn","master":"10|eosc________::54fbf0ac4e42a2ce51e400d9783b51ba"}
{"datasource":"eosc________::geant::geant.open","master":"10|eosc________::9ae24d8c63e9ff986fbd20705b334919"}
{"datasource":"eosc________::geant::geant.perfsonar","master":"10|eosc________::1bdda4f743377914fabd0f365a8b6ee2"}
{"datasource":"eosc________::geant::geant.plus","master":"10|eosc________::eef45e860d52aff4932f254599d5b713"}
{"datasource":"eosc________::geant::geant.transits_training","master":"10|eosc________::831e2b596060c60d7d4bc79c200a2254"}
{"datasource":"eosc________::geant::geant.trusted_certificate_service","master":"10|eosc________::30817adfb6c625d7fd36b657e2fabc74"}
{"datasource":"eosc________::geant::geant.wifimon","master":"10|eosc________::6116f3b14f34658593529f6810068c4e"}
{"datasource":"eosc________::genias::genias.e-irg_knowledge_base","master":"10|eosc________::ddc5ab67fed353917716eb2d5c86ce68"}
{"datasource":"eosc________::gesis::gesis.doi_registration_service","master":"10|eosc________::71f37a7ebd8495a59c46e637ee5463da"}
{"datasource":"eosc________::grnet::grnet.agora_resource_portfolio_management_tool","master":"10|eosc________::461aa754c52b7eed605f9e0955470de5"}
{"datasource":"eosc________::grnet::grnet.argo_monitoring_engine","master":"10|eosc________::e91a3b4dfb62113b9b67b0ac97e566b4"}
{"datasource":"eosc________::grnet::grnet.aris","master":"10|eosc________::6b381464ec768e3cf55ccacdb00b5988"}
{"datasource":"eosc________::grnet::grnet.aris_-_archival_service","master":"10|eosc________::32158f91e33cf6fb6c63561cbc7ffd24"}
{"datasource":"eosc________::grnet::grnet.ni4os-europe_login","master":"10|eosc________::aeaa8f7fc2948930bfa4f970cd96837e"}
{"datasource":"eosc________::grnet::grnet.ni4os-europe_repository_service","master":"10|eosc________::d6933cb7acd6fa7a2f7a42562c432fb5"}
{"datasource":"eosc________::grycap::grycap.elastic_cloud_compute_cluster","master":"10|eosc________::c6d3c380ce5499d8d20cc9bbeb3b43ff"}
{"datasource":"eosc________::grycap::grycap.infrastructure_manager","master":"10|eosc________::e8a2eeb06a205c3299af49f5c233ce16"}
{"datasource":"eosc________::grycap::grycap.saps_surface_energy_balance_automated_processing_service","master":"10|eosc________::a7ae875b2487576c35f1bc8e1c857c14"}
{"datasource":"eosc________::hn::hn.isidore","master":"10|re3data_____::fabe5c1aaa2e2d4c847e01647b87bf60"}
{"datasource":"eosc________::hostkey::hostkey.gpu_servers_grant_program","master":"10|eosc________::d45f87107eb536b4be97e112fac15787"}
{"datasource":"eosc________::icos_eric::icos_eric.data_discovery_and_access_portal","master":"10|eosc________::84ada2e91828ce72fa6d02736cdd90f1"}
{"datasource":"eosc________::ifca-csic::ifca-csic.deepaas_training_facility","master":"10|eosc________::5414e2342e67d64b11b835e7fd58869d"}
{"datasource":"eosc________::ifca-csic::ifca-csic.ifca-csic_cloud_infrastructure","master":"10|eosc________::838e5c334e8115e4831d5f21435aa19b"}
{"datasource":"eosc________::ifca-csic::ifca-csic.plant_classification","master":"10|eosc________::32c26f83acaef8d89cc6c7a2f8abd198"}
{"datasource":"eosc________::ifca-csic::ifca-csic.remote_monitoring_and_smart_sensing","master":"10|eosc________::0335d29ec68ef9ebad8326cba79455f2"}
{"datasource":"eosc________::ifin-hh::ifin-hh.cloudifin","master":"10|eosc________::04d791df0b61b0f5060f241c70924991"}
{"datasource":"eosc________::iisas::iisas.dynamic_dns_service","master":"10|eosc________::2381e3b55d048130f2dffd437123d501"}
{"datasource":"eosc________::iisas::iisas.fedcloudclient_egi_fedcloud_client","master":"10|eosc________::3668885b6512a039673b9f4638c88600"}
{"datasource":"eosc________::iisas::iisas.modelling_service_for_water_supply_systems","master":"10|eosc________::b1d6d2cebddf52f6647102a30690fba9"}
{"datasource":"eosc________::ill::ill.ill_data_portal","master":"10|eosc________::714498cf1efec13c2206db4b1e4f1c30"}
{"datasource":"eosc________::ill::ill.panosc_software_catalogue","master":"10|eosc________::bc63c5a78abd38a7d9df043e0853a9ce"}
{"datasource":"eosc________::inaf::inaf.space-ml_caesar_service","master":"10|eosc________::ba42c5e4332ff16c6cd28573012bc2f9"}
{"datasource":"eosc________::inaf::inaf.space-vis_vialactea_service","master":"10|eosc________::ce2ca563bceae686b763326ed53e7b54"}
{"datasource":"eosc________::infn::infn.dynamic_on_demand_analysis_service","master":"10|eosc________::f884894e05c5a54646f0b5715e5495d6"}
{"datasource":"eosc________::infn::infn.fgsg_science_software_on_demand","master":"10|eosc________::452af4e76a64b6ee7e4bdc86527687f7"}
{"datasource":"eosc________::infn::infn.indigo_identity_and_access_management","master":"10|eosc________::d23115c40a4e256725f140330d001861"}
{"datasource":"eosc________::infn::infn.infn-cloud_object_storage_dice","master":"10|eosc________::fe0c28e8657cb84e3b775156106c03d1"}
{"datasource":"eosc________::infn::infn.paas_orchestrator","master":"10|eosc________::146240bb16057a93e11631edee570f76"}
{"datasource":"eosc________::infrafrontier::infrafrontier.training_in_mouse_functional_genomics","master":"10|eosc________::64d6597d10f4e617152f4a612a87eaba"}
{"datasource":"eosc________::inria::inria.software_heritage_archive","master":"10|fairsharing_::2c758933af02c0b301906f2819ae1268"}
{"datasource":"eosc________::jelastic::jelastic.platform-as-a-service","master":"10|eosc________::bfcae4ab00df41a3c43efbb879586e8f"}
{"datasource":"eosc________::kit::kit.eosc-performance","master":"10|eosc________::e52ab75587c1dd98db80568197f04586"}
{"datasource":"eosc________::kit::kit.o3as_ozone_assessment","master":"10|eosc________::aaf27a5f35a790617247abecd84b100f"}
{"datasource":"eosc________::komanord::komanord.guardomic","master":"10|eosc________::b1e06c9d2c472e9441ee72e83a934d40"}
{"datasource":"eosc________::lago::lago.onedatasim","master":"10|eosc________::2b2163e8b82320fed69a017a3e5fb657"}
{"datasource":"eosc________::lifewatch-eric::lifewatch-eric.plants_identification_app","master":"10|eosc________::6fc6ed0894391496d3c4967d45933d1a"}
{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.elixirfm","master":"10|eosc________::6dd7c323776a028cef0619cb34bdf48c"}
{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.ker_-_keyword_extractor","master":"10|eosc________::09915f038900aa43cb0c76aa89f10cda"}
{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.lindatclariah-cz_repository","master":"10|eosc________::3daee6a29fb1d9a0f624cdd5973c33ea"}
{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.machine_translation","master":"10|eosc________::3ae4551729381cfd03c433fb0de0c971"}
{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.morphodita","master":"10|eosc________::f2ceebdc1a41d65504ff27f7297c833b"}
{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.nametag","master":"10|eosc________::71e3226e7a868e2215335ffb29073285"}
{"datasource":"eosc________::lindatclariah-cz::lindatclariah-cz.udpipe_tool_for_lemmatization_morphological_analysis_pos_tagging_and_dependency_parsing_in_multiple_languages","master":"10|eosc________::2dfc64c2951d9be3f1e2b576633ea425"}
{"datasource":"eosc________::lnec::lnec-pt.opencoasts_portal","master":"10|eosc________::7e99655aeda0b5f06efb3eea424dff54"}
{"datasource":"eosc________::lnec::lnec.worsica_-_water_monitoring_sentinel_cloud_platform","master":"10|eosc________::c2f55ab774c3cbbd9a330eebaa74dc36"}
{"datasource":"eosc________::materialscloud::materialscloud.aiiDA_lab","master":"10|eosc________::dfd970a812cf2e0298eb28c681bc109f"}
{"datasource":"eosc________::materialscloud::materialscloud.materials_cloud_archive","master":"10|fairsharing_::a431d70133ef6cf688bc4f6093922b48"}
{"datasource":"eosc________::meeo::meeo.adam_platform","master":"10|eosc________::b17fedb87dd9985b6a5e51db593446d6"}
{"datasource":"eosc________::meeo::meeo.adam_space","master":"10|eosc________::24bfbca4cf4fedc5a4a662fe67a30d7e"}
{"datasource":"eosc________::mobile_observation_integration_service::mobile_observation_integration_service.dark_sky_meter_datasource","master":"10|eosc________::160638e73224aeb7e4f98fd237672919"}
{"datasource":"eosc________::msw::msw.polaris_os","master":"10|eosc________::12348ba5b2c5902fd400cb3f1ab773ee"}
{"datasource":"eosc________::obp::obp.thoth","master":"10|eosc________::680198ec3f51a744de8a7603d542a0e1"}
{"datasource":"eosc________::openaire::openaire.amnesia","master":"10|eosc________::ac57e2dd5b3ee01909d7a592523bb96f"}
{"datasource":"eosc________::openaire::openaire.argos","master":"10|eosc________::92145beb3257af0510ee61ef10d16870"}
{"datasource":"eosc________::openaire::openaire.broker","master":"10|eosc________::c8c6e8d211d6df4ee8a187fa1134bd92"}
{"datasource":"eosc________::openaire::openaire.data_provider_dashboard","master":"10|eosc________::809d4c77a7acf9ac0cc2990d4264ae51"}
{"datasource":"eosc________::openaire::openaire.digital_humanities_and_cultural_heritage_openaire_community_gateway","master":"10|eosc________::b9110e9735dd467abc969fe8e2f1efa3"}
{"datasource":"eosc________::openaire::openaire.discovery_portal","master":"10|eosc________::992052173b689c8cea94e8e8d99f0238"}
{"datasource":"eosc________::openaire::openaire.european_marine_science_openaire_dashboard","master":"10|eosc________::950a99851df85c90ec2e933e1d55e164"}
{"datasource":"eosc________::openaire::openaire.funder_dashboard","master":"10|eosc________::196eea80ab9d73766cd2e8b6ab85872f"}
{"datasource":"eosc________::openaire::openaire.graph","master":"10|eosc________::c122caed52a88b57732b814a74141000"}
{"datasource":"eosc________::openaire::openaire.greek_sustainable_development_solutions_network_sdsn_openaire_dashboard","master":"10|eosc________::8100e41e3a5b18170bc5ede2cc393331"}
{"datasource":"eosc________::openaire::openaire.inference","master":"10|eosc________::c491811e9a6afa69cdcab0f92fca6f7b"}
{"datasource":"eosc________::openaire::openaire.neuroinformatics_openaire_dashboard","master":"10|eosc________::6e3adcce4d0d4229a9749584dfd5e7a8"}
{"datasource":"eosc________::openaire::openaire.open_science_helpdesk","master":"10|eosc________::d66db88d4c6c354fe7ebcd4c3dce334e"}
{"datasource":"eosc________::openaire::openaire.open_science_observatory","master":"10|eosc________::441ee64860eb79808b7cf0bb08262be6"}
{"datasource":"eosc________::openaire::openaire.open_science_training","master":"10|eosc________::99847506cdff50afa4945d60a9661ea3"}
{"datasource":"eosc________::openaire::openaire.openaire_login","master":"10|eosc________::818973a9375c0fa545499e1bb9ad0ab2"}
{"datasource":"eosc________::openaire::openaire.openapc","master":"10|eosc________::a28cc193bc938573e892b8aad0017702"}
{"datasource":"eosc________::openaire::openaire.research_community_dashboard","master":"10|eosc________::e1a866322f76407fb161a253dc5b539c"}
{"datasource":"eosc________::openaire::openaire.scholexplorer","master":"10|eosc________::6b34adede04121175566ef8c70f1e520"}
{"datasource":"eosc________::openaire::openaire.technical_support_towards_openaire_compliance","master":"10|eosc________::cdb8e94b386f9b6780a47194bd1bc7f7"}
{"datasource":"eosc________::openaire::openaire.topos_observatory_for_organisations","master":"10|eosc________::a7d2b95257273b5ea3f3a23fd8a60d48"}
{"datasource":"eosc________::openaire::openaire.usage_statistics","master":"10|eosc________::8aa345dc7321fc97906bf4c193a05a8f"}
{"datasource":"eosc________::openaire::openaire.validator","master":"10|eosc________::f2c13efbaa2a33af3e4e6a54805ac379"}
{"datasource":"eosc________::openaire::openaire.zenodo","master":"10|opendoar____::358aee4cc897452c00244351e4d91f69"}
{"datasource":"eosc________::openbiomaps::openbiomaps.openbiomaps","master":"10|eosc________::32edf5a4edbdea0899d6ba588d083efd"}
{"datasource":"eosc________::openedition::openedition.operas_research_for_society","master":"10|eosc________::2cdf4f57007b990b7ad7a884796f9b15"}
{"datasource":"eosc________::openknowledgemaps::openknowledgemaps.open_knowledge_maps","master":"10|eosc________::f3819d0f8e8bf57d383b23d31a3c0099"}
{"datasource":"eosc________::openminted::openminted.builder_of_tdm_applications","master":"10|eosc________::fdd26c19dd490260bc6c48b5813f4ac3"}
{"datasource":"eosc________::openminted::openminted.catalogue_of_ancillary_resources","master":"10|eosc________::ab4e37e85a1975b204b66683ed3888a8"}
{"datasource":"eosc________::openminted::openminted.catalogue_of_corpora","master":"10|eosc________::2cf744a594ea30fd31e976bffa8f2b71"}
{"datasource":"eosc________::openminted::openminted.catalogue_of_tdm_applications","master":"10|eosc________::ef5f343c5cf11fa2d40407ec308bb34a"}
{"datasource":"eosc________::openminted::openminted.catalogue_of_tdm_components","master":"10|eosc________::4275243a94677f19a5b74e5afb1f94cf"}
{"datasource":"eosc________::openminted::openminted.consulting_on_licences_for_tdm","master":"10|eosc________::522000b4c90b209aa7be961449ca910f"}
{"datasource":"eosc________::openminted::openminted.corpus_builder_for_scholarly_works","master":"10|eosc________::c64725d47af63bc2114b4214b684a392"}
{"datasource":"eosc________::openminted::openminted.support_and_training","master":"10|eosc________::84501ff99e5e429f5f083ab8ca0be7e4"}
{"datasource":"eosc________::openminted::openminted.tdm_applications_executor","master":"10|eosc________::e9ae655ce2ff1eaa19d0b3475ce5e660"}
{"datasource":"eosc________::operas::operas.gotriple_discovery_platform","master":"10|eosc________::f687e24dc56aaeeb561c95865a5071cc"}
{"datasource":"eosc________::operas::operas.operas_metrics_service","master":"10|eosc________::5960e1289f623625210f720c6173592d"}
{"datasource":"eosc________::oslo_university::oslo_university.services_for_sensitive_data_tsd","master":"10|eosc________::743b01351510f88e24be1c700c581f68"}
{"datasource":"eosc________::osmooc::osmooc.open_science_mooc","master":"10|eosc________::e101101e8653b6607a3ad9fea3b7d1fe"}
{"datasource":"eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing","master":"10|openaire____::bf5a61cc330e21ffa90eed3eb1533466"}
{"datasource":"eosc________::phenomenal::phenomenal.phenomenal","master":"10|eosc________::79e19b14aeee0d94e9a79110a6e6ad32"}
{"datasource":"eosc________::plantnet::plantnet.plntnet_identification_service","master":"10|eosc________::5ce89743eafdd8578591d84150f547e4"}
{"datasource":"eosc________::prace::prace.application_enabling_support","master":"10|eosc________::c87fd74ed685337fdbcff504373fc513"}
{"datasource":"eosc________::prace::prace.code_vault","master":"10|eosc________::dbab7889c81b59ec753040a762f6569a"}
{"datasource":"eosc________::prace::prace.deci_access","master":"10|eosc________::c7cedb82b1beea5382601d48807212aa"}
{"datasource":"eosc________::prace::prace.mooc","master":"10|eosc________::d6ff8167d31dccebe33a272513422b53"}
{"datasource":"eosc________::prace::prace.patc","master":"10|eosc________::1ab1b123bd559ee7f7c7ec2ee353f0c0"}
{"datasource":"eosc________::prace::prace.preparatory_access","master":"10|eosc________::39430adf529f1ab9e33da444b3708fcf"}
{"datasource":"eosc________::prace::prace.project_access","master":"10|eosc________::b58e957946983b686c76ee19dfab8d70"}
{"datasource":"eosc________::prace::prace.ptc","master":"10|eosc________::b3ca18e8884bfe2422d3723313fef79c"}
{"datasource":"eosc________::prace::prace.seasonal_schools_and_international_summer_school","master":"10|eosc________::590c71318d9d94c32981e3195567d546"}
{"datasource":"eosc________::prace::prace.shape","master":"10|eosc________::38b5a26f74e4808270a2d4f305d2f3a5"}
{"datasource":"eosc________::prace::prace.training_portal","master":"10|eosc________::25966a269ab2343ac9c4d982c341d87f"}
{"datasource":"eosc________::predictia::predictia.climadjust","master":"10|eosc________::14743eb22da3524893784faf409aac70"}
{"datasource":"eosc________::psi::psi.psi_public_data_repository","master":"10|re3data_____::1e55174ff77ed2d804871281201dbb50"}
{"datasource":"eosc________::psi::psi.remote_desktop_service","master":"10|eosc________::c82e26eb6e65d008de03b349dffc11fc"}
{"datasource":"eosc________::psnc::psnc.rohub","master":"10|eosc________::c87f08707b5235172e85b374e39a82dc"}
{"datasource":"eosc________::psnc::psnc.symbiote","master":"10|eosc________::ef0cd965a0d0a3df80ecfae4b3b08aad"}
{"datasource":"eosc________::rasdaman::rasdaman.datacube","master":"10|eosc________::bb1678f7b15d8c15fde6e240a4f95f93"}
{"datasource":"eosc________::rbi::rbi.dariah_science_gateway","master":"10|eosc________::b51b448421d926293b3781f4ac90f4f4"}
{"datasource":"eosc________::readcoop::readcoop.transkribus","master":"10|eosc________::a80411026809e6eaa896439e1b9764f4"}
{"datasource":"eosc________::rli::rli.open_energy_platform","master":"10|fairsharing_::0cbed40c0d920b94126eaf5e707be1f5"}
{"datasource":"eosc________::ror-org::ror-org.identifier","master":"10|eosc________::6fe92c2346db22322ddf6b677d449b0e"}
{"datasource":"eosc________::sciences_po::sciences_po.ethnic_and_migrant_minority_survey_registry","master":"10|eosc________::0cde986dc2bf015912e407f0f83ee402"}
{"datasource":"eosc________::sciences_po::sciences_po.wpss_for_ess","master":"10|eosc________::9a5bb11c495443aad944b04f5fcb5c07"}
{"datasource":"eosc________::scigne::scigne.cloud_compute","master":"10|eosc________::7c63e3284c36b5977c553192dce506b3"}
{"datasource":"eosc________::scipedia::scipedia.scipedia","master":"10|eosc________::850abcddc76069f2c3c1cf77ad4beec9"}
{"datasource":"eosc________::scipedia::scipedia.topos_for_individuals","master":"10|eosc________::e6214b58f39a25b53eecda340f95ee7b"}
{"datasource":"eosc________::seadatanet::seadatanet.doi_minting_service","master":"10|eosc________::f87f72147a3c82c4f77684e40101e90e"}
{"datasource":"eosc________::seadatanet::seadatanet.european_directory_of_marine_environmental_data_edmed","master":"10|eosc________::d79706389f0b864306feb47aac1f5766"}
{"datasource":"eosc________::seadatanet::seadatanet.european_directory_of_marine_environmental_research_projects","master":"10|eosc________::baa9d2d6cdd8507fcbf76242e4c25d76"}
{"datasource":"eosc________::seadatanet::seadatanet.european_directory_of_marine_organisations_edmo","master":"10|eosc________::5d23c66c26e0df209fc415c1e9ad0316"}
{"datasource":"eosc________::seadatanet::seadatanet.european_directory_of_the_cruise_summary_reports_csr","master":"10|eosc________::fd70912c66037dc11f710587e281eeaf"}
{"datasource":"eosc________::seadatanet::seadatanet.european_directory_of_the_initial_ocean-observing_systems_edios","master":"10|eosc________::846016e987d1feaf2a36083f88dba1f2"}
{"datasource":"eosc________::seadatanet::seadatanet.seadatanet_cdi","master":"10|eosc________::36cd158d6b1bbdbfb443c68b8da00335"}
{"datasource":"eosc________::seadatanet::seadatanet.vocabulary_services_-_underpinned_by_the_nerc_vocabulary_server_nvs","master":"10|eosc________::4416d18ec7a57e553979fbfa4d862483"}
{"datasource":"eosc________::sinergise::sinergise.sentinel_hub","master":"10|eosc________::d36ae944fa207461bcb7b2b3a6c94de8"}
{"datasource":"eosc________::sixsq::sixsq.nuvla_multi-cloud_application_management_platform","master":"10|eosc________::38438cc3190a3815359efb53b9dd98eb"}
{"datasource":"eosc________::sks::sks.digital_production_for_conferences_workshops_roundtables_and_other_academic_and_professional_events","master":"10|eosc________::f6b51bef4a5f1478e980673339f2b2f3"}
{"datasource":"eosc________::smartsmear::smartsmear.smartsmear","master":"10|eosc________::d17a9325ca64ffad59e04659ed5404f7"}
{"datasource":"eosc________::sobigdata::sobigdata.tagme","master":"10|eosc________::0c3b8b80d9d6d38effd28bfa6a140a12"}
{"datasource":"eosc________::suite5::suite5.furniture_enterprise_analytics","master":"10|eosc________::29ed60070bd91bdc19c9f278b104465c"}
{"datasource":"eosc________::switch::switch.switchengines","master":"10|eosc________::d4143918a810115206640cfeb11e0ba6"}
{"datasource":"eosc________::t-systems::t-systems.open_telekom_cloud","master":"10|eosc________::c489ef6564a47922359f7b833919d642"}
{"datasource":"eosc________::terradue::terradue.eo_services_for_earthquake_response_and_landslides_analysis","master":"10|eosc________::ab3140d145deb5fdb02eeefbc5ebc471"}
{"datasource":"eosc________::tib::tib.open_research_knowledge_graph_orkg","master":"10|eosc________::ed6bd695c7a99297f360bc2fc915be90"}
{"datasource":"eosc________::ubora::ubora.ubora","master":"10|eosc________::bacf05aff1c6dcf3133a0352d5eb14c4"}
{"datasource":"eosc________::ubora::ubora.ubora_e-platform","master":"10|eosc________::947fde33605ba61216a07135ee1551f2"}
{"datasource":"eosc________::ugr-es::ugr-es.glacier_lagoons_of_sierra_nevada","master":"10|eosc________::8a966c0efca298ad5ec130d323c29935"}
{"datasource":"eosc________::uit::uit.dataverseno","master":"10|eosc________::92b76aa81a5b8443fcf17d3ae3c34211"}
{"datasource":"eosc________::uit::uit.the_troms_repository_of_language_and_linguistics_trolling","master":"10|fairsharing_::a36b0dcd1e6384abc0e1867860ad3ee3"}
{"datasource":"eosc________::ukaea::ukaea.prominence","master":"10|eosc________::06ce999c7cf77ea5a65f87bb563cd625"}
{"datasource":"eosc________::ukri_-_stfc::ukri_-_stfc.cvmfs_test","master":"10|eosc________::53aaa0a24d0edc47c23e722135c29dde"}
{"datasource":"eosc________::ukri_-_stfc::ukri_-_stfc.rucio","master":"10|eosc________::c19a8251c6bf563365c555572ace903e"}
{"datasource":"eosc________::uni-freiburg::uni-freiburg.european_galaxy_server","master":"10|eosc________::cc00fc2385475b80accec001dfb85efb"}
{"datasource":"eosc________::unibo::unibo.opencitations","master":"10|eosc________::573c29ecaf76ab961743bfc8a7d911ec"}
{"datasource":"eosc________::unifl::unifl.snap4city","master":"10|eosc________::9a55c40c3c082b7a8352ecbc56a87996"}
{"datasource":"eosc________::unige::unige.astronomical_online_data_analysis_astrooda","master":"10|eosc________::63f6119d3170cccf979daada3c5b524e"}
{"datasource":"eosc________::unitartu::unitartu.ut.rocket","master":"10|eosc________::da3450589a9d56212963b20cf729974c"}
{"datasource":"eosc________::upv-es::upv-es.lemonade","master":"10|eosc________::afdd227beada491f77d7944d7a0eafc9"}
{"datasource":"eosc________::vamdc::vamdc.portal","master":"10|eosc________::4dab2bb6e9a9ad223cd63c62c2ea804e"}
{"datasource":"eosc________::vamdc::vamdc.query_store","master":"10|eosc________::33f18bfe544c3c84ac28be6a3292d166"}
{"datasource":"eosc________::vamdc::vamdc.species_database","master":"10|eosc________::ae3587682dec5663a1b3b625036d15d0"}
{"datasource":"eosc________::vilnius-university::vilnius-university.the_national_open_access_research_data_archive_midas","master":"10|eosc________::4987ee0d071f68cf88f6b1a834b6733f"}
{"datasource":"eosc________::wenmr::wenmr.amber-based_portal_server_for_nmr_structures_amps-nmr","master":"10|eosc________::c6cca9747ef3ce296bd626bcbc4e480a"}
{"datasource":"eosc________::wenmr::wenmr.disvis_web_portal","master":"10|eosc________::2539ec693b683284c4e243b969ae3fc0"}
{"datasource":"eosc________::wenmr::wenmr.fanten_finding_anisotropy_tensor","master":"10|eosc________::99c793e3f3b856c48eaaa36682038b28"}
{"datasource":"eosc________::wenmr::wenmr.haddock24_web_portal","master":"10|eosc________::0f198f6a0885105809f420be23614be3"}
{"datasource":"eosc________::wenmr::wenmr.metalpdb","master":"10|eosc________::84676bc3d2ce17de70309dc58f428296"}
{"datasource":"eosc________::wenmr::wenmr.pdb-tools_web","master":"10|eosc________::b37eed45624ac30f3476f71640e59a61"}
{"datasource":"eosc________::wenmr::wenmr.powerfit_web_portal","master":"10|eosc________::93d4d621ed1da378c0e7dc891cefc007"}
{"datasource":"eosc________::wenmr::wenmr.spoton","master":"10|eosc________::76e7e0552f9c6b89db94b31ddc366b9f"}

View File

@ -275,10 +275,13 @@ public abstract class AbstractMdRecordToOafMapper {
res res
.add( .add(
getRelation( OafMapperUtils
.getRelation(
docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate)); docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity, validationdDate));
res res
.add(getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate)); .add(
OafMapperUtils
.getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity, validationdDate));
} }
} }
@ -311,11 +314,14 @@ public abstract class AbstractMdRecordToOafMapper {
final String targetId = createOpenaireId(targetType, target, true); final String targetId = createOpenaireId(targetType, target, true);
rels rels
.add( .add(
getRelation( OafMapperUtils
entity.getId(), targetId, relType, subRelType, relClass, entity, validationdDate)); .getRelation(
entity.getId(), targetId, relType, subRelType, relClass, entity,
validationdDate));
rels rels
.add( .add(
getRelation( OafMapperUtils
.getRelation(
targetId, entity.getId(), relType, subRelType, relClassInverse, entity, targetId, entity.getId(), relType, subRelType, relClassInverse, entity,
validationdDate)); validationdDate));
} }
@ -325,36 +331,6 @@ public abstract class AbstractMdRecordToOafMapper {
return rels; return rels;
} }
protected Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final OafEntity entity) {
return getRelation(source, target, relType, subRelType, relClass, entity, null);
}
protected Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final OafEntity entity,
final String validationDate) {
final Relation rel = new Relation();
rel.setRelType(relType);
rel.setSubRelType(subRelType);
rel.setRelClass(relClass);
rel.setSource(source);
rel.setTarget(target);
rel.setCollectedfrom(entity.getCollectedfrom());
rel.setDataInfo(entity.getDataInfo());
rel.setLastupdatetimestamp(entity.getLastupdatetimestamp());
rel.setValidated(StringUtils.isNotBlank(validationDate));
rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
return rel;
}
protected abstract List<Oaf> addOtherResultRels( protected abstract List<Oaf> addOtherResultRels(
final Document doc, final Document doc,
final OafEntity entity); final OafEntity entity);

View File

@ -1,32 +1,7 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_MERGED_IN;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.MERGES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORG_ORG_RELTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
import java.io.Closeable; import java.io.Closeable;
@ -45,6 +20,8 @@ import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
@ -68,6 +45,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable { public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
@ -437,25 +415,14 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final List<KeyValue> collectedFrom = listKeyValues( final List<KeyValue> collectedFrom = listKeyValues(
createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"));
final Relation r1 = new Relation(); final Relation r1 = OafMapperUtils
r1.setRelType(DATASOURCE_ORGANIZATION); .getRelation(
r1.setSubRelType(PROVISION); dsId, orgId, DATASOURCE_ORGANIZATION, PROVISION, IS_PROVIDED_BY, collectedFrom, info,
r1.setRelClass(IS_PROVIDED_BY); lastUpdateTimestamp);
r1.setSource(dsId);
r1.setTarget(orgId);
r1.setCollectedfrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
final Relation r2 = new Relation(); final Relation r2 = OafMapperUtils
r2.setRelType(DATASOURCE_ORGANIZATION); .getRelation(
r2.setSubRelType(PROVISION); orgId, dsId, DATASOURCE_ORGANIZATION, PROVISION, PROVIDES, collectedFrom, info, lastUpdateTimestamp);
r2.setRelClass(PROVIDES);
r2.setSource(orgId);
r2.setTarget(dsId);
r2.setCollectedfrom(collectedFrom);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(r1, r2); return Arrays.asList(r1, r2);
} catch (final Exception e) { } catch (final Exception e) {
@ -471,25 +438,20 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final List<KeyValue> collectedFrom = listKeyValues( final List<KeyValue> collectedFrom = listKeyValues(
createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"));
final Relation r1 = new Relation(); final List<KeyValue> properties = Lists
r1.setRelType(PROJECT_ORGANIZATION); .newArrayList(
r1.setSubRelType(PARTICIPATION); keyValue("contribution", String.valueOf(rs.getDouble("contribution"))),
r1.setRelClass(HAS_PARTICIPANT); keyValue("currency", rs.getString("currency")));
r1.setSource(projectId);
r1.setTarget(orgId);
r1.setCollectedfrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
final Relation r2 = new Relation(); final Relation r1 = OafMapperUtils
r2.setRelType(PROJECT_ORGANIZATION); .getRelation(
r2.setSubRelType(PARTICIPATION); projectId, orgId, PROJECT_ORGANIZATION, PARTICIPATION, HAS_PARTICIPANT, collectedFrom, info,
r2.setRelClass(IS_PARTICIPANT); lastUpdateTimestamp, null, properties);
r2.setSource(orgId);
r2.setTarget(projectId); final Relation r2 = OafMapperUtils
r2.setCollectedfrom(collectedFrom); .getRelation(
r2.setDataInfo(info); orgId, projectId, PROJECT_ORGANIZATION, PARTICIPATION, IS_PARTICIPANT, collectedFrom, info,
r2.setLastupdatetimestamp(lastUpdateTimestamp); lastUpdateTimestamp, null, properties);
return Arrays.asList(r1, r2); return Arrays.asList(r1, r2);
} catch (final Exception e) { } catch (final Exception e) {
@ -703,25 +665,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final List<KeyValue> collectedFrom = listKeyValues( final List<KeyValue> collectedFrom = listKeyValues(
createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"));
final Relation r1 = new Relation(); final Relation r1 = OafMapperUtils
r1.setRelType(ORG_ORG_RELTYPE); .getRelation(orgId1, orgId2, ORG_ORG_RELTYPE, DEDUP, MERGES, collectedFrom, info, lastUpdateTimestamp);
r1.setSubRelType(ModelConstants.DEDUP);
r1.setRelClass(MERGES);
r1.setSource(orgId1);
r1.setTarget(orgId2);
r1.setCollectedfrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
final Relation r2 = new Relation(); final Relation r2 = OafMapperUtils
r2.setRelType(ORG_ORG_RELTYPE); .getRelation(
r2.setSubRelType(ModelConstants.DEDUP); orgId2, orgId1, ORG_ORG_RELTYPE, DEDUP, IS_MERGED_IN, collectedFrom, info, lastUpdateTimestamp);
r2.setRelClass(IS_MERGED_IN);
r2.setSource(orgId2);
r2.setTarget(orgId1);
r2.setCollectedfrom(collectedFrom);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(r1, r2); return Arrays.asList(r1, r2);
} catch (final Exception e) { } catch (final Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
@ -738,17 +687,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final List<KeyValue> collectedFrom = listKeyValues( final List<KeyValue> collectedFrom = listKeyValues(
createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"));
final Relation r = new Relation(); return Arrays
r.setRelType(ORG_ORG_RELTYPE); .asList(
r.setSubRelType(ModelConstants.RELATIONSHIP); OafMapperUtils
r.setRelClass(rs.getString("type")); .getRelation(
r.setSource(orgId1); orgId1, orgId2, ORG_ORG_RELTYPE, RELATIONSHIP, rs.getString("type"), collectedFrom, info,
r.setTarget(orgId2); lastUpdateTimestamp));
r.setCollectedfrom(collectedFrom);
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
return Arrays.asList(r);
} catch (final Exception e) { } catch (final Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
@ -765,29 +709,12 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final List<KeyValue> collectedFrom = listKeyValues( final List<KeyValue> collectedFrom = listKeyValues(
createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"));
final Relation r1 = new Relation(); return Arrays
r1.setRelType(ORG_ORG_RELTYPE); .asList(
r1.setSubRelType(ModelConstants.DEDUP); OafMapperUtils
r1.setRelClass(relClass); .getRelation(
r1.setSource(orgId1); orgId1, orgId2, ORG_ORG_RELTYPE, DEDUP, relClass, collectedFrom, info,
r1.setTarget(orgId2); lastUpdateTimestamp));
r1.setCollectedfrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
// removed because there's no difference between two sides //TODO
// final Relation r2 = new Relation();
// r2.setRelType(ORG_ORG_RELTYPE);
// r2.setSubRelType(ORG_ORG_SUBRELTYPE);
// r2.setRelClass(relClass);
// r2.setSource(orgId2);
// r2.setTarget(orgId1);
// r2.setCollectedfrom(collectedFrom);
// r2.setDataInfo(info);
// r2.setLastupdatetimestamp(lastUpdateTimestamp);
// return Arrays.asList(r1, r2);
return Arrays.asList(r1);
} catch (final Exception e) { } catch (final Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }

View File

@ -3,6 +3,7 @@ SELECT
po.resporganization AS resporganization, po.resporganization AS resporganization,
po.participantnumber AS participantnumber, po.participantnumber AS participantnumber,
po.contribution AS contribution, po.contribution AS contribution,
po.currency AS currency,
NULL AS startdate, NULL AS startdate,
NULL AS enddate, NULL AS enddate,
false AS inferred, false AS inferred,

View File

@ -1,5 +1,6 @@
[ [
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true} {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true},
{"paramName":"r", "paramLongName":"filterRelation", "paramDescription": "the relation to filter", "paramRequired": false}
] ]

View File

@ -3,5 +3,7 @@
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"su", "paramLongName":"scholixUpdatePath", "paramDescription": "the scholix updated Path", "paramRequired": false}, {"paramName":"su", "paramLongName":"scholixUpdatePath", "paramDescription": "the scholix updated Path", "paramRequired": false},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true}, {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true},
{"paramName":"o", "paramLongName":"objectType", "paramDescription": "should be scholix or Summary", "paramRequired": true} {"paramName":"o", "paramLongName":"objectType", "paramDescription": "should be scholix or Summary", "paramRequired": true},
{"paramName":"mp", "paramLongName":"maxPidNumberFilter", "paramDescription": "filter max number of pids in source/target", "paramRequired": false}
] ]

View File

@ -0,0 +1,10 @@
<configuration>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,145 @@
<workflow-app name="Create Scholix Dump" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the working dir base path</description>
</property>
<property>
<name>targetPath</name>
<description>the final graph path</description>
</property>
<property>
<name>relationFilter</name>
<description>Filter relation semantic</description>
</property>
<property>
<name>maxNumberOfPid</name>
<description>keep only relations whose source and target each have at most maxNumberOfPid identifiers</description>
</property>
</parameters>
<start to="ImportDatasetEntities"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportDatasetEntities">
<!-- First step of the workflow: runs SparkConvertRDDtoDataset on YARN (cluster mode),
     reading from ${sourcePath} and writing under ${targetPath}; the workflow parameter
     relationFilter is handed to the job as its filterRelation argument.
     Success continues to CreateSummaries, any failure goes to Kill. -->
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Import JSONRDD to Dataset kryo</name>
<class>eu.dnetlib.dhp.sx.graph.SparkConvertRDDtoDataset</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=3000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--filterRelation</arg><arg>${relationFilter}</arg>
</spark>
<ok to="CreateSummaries"/>
<error to="Kill"/>
</action>
<action name="CreateSummaries">
<!-- Runs SparkCreateSummaryObject on YARN (cluster mode): input is ${targetPath}/entities,
     output is ${targetPath}/provision/summaries.
     Success continues to CreateScholix, any failure goes to Kill. -->
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Convert Entities to summaries</name>
<class>eu.dnetlib.dhp.sx.graph.SparkCreateSummaryObject</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=20000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--sourcePath</arg><arg>${targetPath}/entities</arg>
<arg>--targetPath</arg><arg>${targetPath}/provision/summaries</arg>
</spark>
<ok to="CreateScholix"/>
<error to="Kill"/>
</action>
<action name="CreateScholix">
<!-- Runs SparkCreateScholix on YARN (cluster mode) with the summaries written to
     ${targetPath}/provision/summaries and the relations under ${targetPath}/relation;
     output goes to ${targetPath}/provision/scholix.
     Success continues to DropJSONPath, any failure goes to Kill. -->
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Generate Scholix Dataset</name>
<class>eu.dnetlib.dhp.sx.graph.SparkCreateScholix</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=30000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--summaryPath</arg><arg>${targetPath}/provision/summaries</arg>
<arg>--targetPath</arg><arg>${targetPath}/provision/scholix</arg>
<arg>--relationPath</arg><arg>${targetPath}/relation</arg>
</spark>
<ok to="DropJSONPath"/>
<error to="Kill"/>
</action>
<action name="DropJSONPath">
<!-- Resets the JSON output location: deletes ${targetPath}/json and recreates it empty
     before SerializeScholix writes beneath it. Any failure goes to Kill. -->
<fs>
<delete path='${targetPath}/json'/>
<mkdir path='${targetPath}/json/'/>
</fs>
<ok to="SerializeScholix"/>
<error to="Kill"/>
</action>
<action name="SerializeScholix">
<!-- Final step: runs SparkConvertObjectToJson on YARN (cluster mode) with objectType
     "scholix", reading ${targetPath}/provision/scholix/scholix and writing
     ${targetPath}/json/scholix_json. Success goes to End, any failure to Kill. -->
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Serialize scholix to JSON</name>
<class>eu.dnetlib.dhp.sx.graph.SparkConvertObjectToJson</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=6000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--sourcePath</arg><arg>${targetPath}/provision/scholix/scholix</arg>
<arg>--targetPath</arg><arg>${targetPath}/json/scholix_json</arg>
<arg>--objectType</arg><arg>scholix</arg>
<!-- Must be the EL reference to the workflow parameter: the job parses this value as an
     integer and skips the pid-count filter when it is not numeric, so passing the bare
     parameter name would silently disable the filter. -->
<arg>--maxPidNumberFilter</arg><arg>${maxNumberOfPid}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -4,6 +4,7 @@ import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.sx.scholix.Scholix import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.SparkConvertObjectToJson.toInt
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
@ -12,6 +13,14 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkConvertObjectToJson { object SparkConvertObjectToJson {
/** Converts `s` with `String.toInt`, yielding `Some(value)` on success and
  * `None` whenever the conversion throws an `Exception`.
  */
def toInt(s: String): Option[Int] =
  try Some(s.toInt)
  catch { case _: Exception => None }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
@ -37,6 +46,8 @@ object SparkConvertObjectToJson {
log.info(s"objectType -> $objectType") log.info(s"objectType -> $objectType")
val scholixUpdatePath = parser.get("scholixUpdatePath") val scholixUpdatePath = parser.get("scholixUpdatePath")
log.info(s"scholixUpdatePath -> $scholixUpdatePath") log.info(s"scholixUpdatePath -> $scholixUpdatePath")
val maxPidNumberFilter = parser.get("maxPidNumberFilter")
log.info(s"maxPidNumberFilter -> $maxPidNumberFilter")
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
@ -47,12 +58,22 @@ object SparkConvertObjectToJson {
case "scholix" => case "scholix" =>
log.info("Serialize Scholix") log.info("Serialize Scholix")
val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix] val d: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix] // val u: Dataset[Scholix] = spark.read.load(s"$scholixUpdatePath/scholix").as[Scholix]
d.union(u) if (maxPidNumberFilter != null && toInt(maxPidNumberFilter).isDefined) {
val mp = toInt(maxPidNumberFilter).get
d
.filter(s => (s.getSource.getIdentifier.size() <= mp) && (s.getTarget.getIdentifier.size() <= mp))
.map(s => mapper.writeValueAsString(s))(Encoders.STRING)
.rdd
.saveAsTextFile(targetPath, classOf[GzipCodec])
} else {
d
.repartition(8000) .repartition(8000)
.map(s => mapper.writeValueAsString(s))(Encoders.STRING) .map(s => mapper.writeValueAsString(s))(Encoders.STRING)
.rdd .rdd
.saveAsTextFile(targetPath, classOf[GzipCodec]) .saveAsTextFile(targetPath, classOf[GzipCodec])
}
case "summary" => case "summary" =>
log.info("Serialize Summary") log.info("Serialize Summary")
val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary] val d: Dataset[ScholixSummary] = spark.read.load(sourcePath).as[ScholixSummary]

View File

@ -4,9 +4,11 @@ import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
object SparkConvertRDDtoDataset { object SparkConvertRDDtoDataset {
@ -34,6 +36,9 @@ object SparkConvertRDDtoDataset {
val t = parser.get("targetPath") val t = parser.get("targetPath")
log.info(s"targetPath -> $t") log.info(s"targetPath -> $t")
val filterRelation = parser.get("filterRelation")
log.info(s"filterRelation -> $filterRelation")
val entityPath = s"$t/entities" val entityPath = s"$t/entities"
val relPath = s"$t/relation" val relPath = s"$t/relation"
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@ -94,9 +99,24 @@ object SparkConvertRDDtoDataset {
log.info("Converting Relation") log.info("Converting Relation")
if (filterRelation != null && StringUtils.isNoneBlank(filterRelation)) {
val rddRelation = spark.sparkContext
.textFile(s"$sourcePath/relation")
.map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && r.getDataInfo.getDeletedbyinference == false)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
//filter OpenCitations relations
.filter(r =>
r.getCollectedfrom != null && r.getCollectedfrom.size() > 0 && !r.getCollectedfrom.asScala.exists(k =>
"opencitations".equalsIgnoreCase(k.getValue)
)
)
.filter(r => r.getSubRelType != null && r.getSubRelType.equalsIgnoreCase(filterRelation))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
} else {
val relationSemanticFilter = List( val relationSemanticFilter = List(
// "cites",
// "iscitedby",
"merges", "merges",
"ismergedin", "ismergedin",
"HasAmongTopNSimilarDocuments", "HasAmongTopNSimilarDocuments",
@ -116,6 +136,7 @@ object SparkConvertRDDtoDataset {
) )
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
}
} }
} }

View File

@ -1,9 +1,7 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.lenient;
@ -32,11 +30,12 @@ import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
public class MigrateDbEntitiesApplicationTest { class MigrateDbEntitiesApplicationTest {
private MigrateDbEntitiesApplication app; private MigrateDbEntitiesApplication app;
@ -62,7 +61,7 @@ public class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
public void testProcessService() throws Exception { void testProcessService() throws Exception {
final List<TypedField> fields = prepareMocks("services_resultset_entry.json"); final List<TypedField> fields = prepareMocks("services_resultset_entry.json");
final List<Oaf> list = app.processService(rs); final List<Oaf> list = app.processService(rs);
@ -75,7 +74,7 @@ public class MigrateDbEntitiesApplicationTest {
.getCollectedfrom() .getCollectedfrom()
.stream() .stream()
.map(KeyValue::getKey) .map(KeyValue::getKey)
.forEach(dsId -> assertValidId(dsId)); .forEach(this::assertValidId);
assertEquals(1, ds.getPid().size()); assertEquals(1, ds.getPid().size());
assertEquals("r3d100010218", ds.getPid().get(0).getValue()); assertEquals("r3d100010218", ds.getPid().get(0).getValue());
@ -164,14 +163,14 @@ public class MigrateDbEntitiesApplicationTest {
.stream() .stream()
.map(Qualifier::getSchemeid) .map(Qualifier::getSchemeid)
.collect(Collectors.toCollection(HashSet::new)); .collect(Collectors.toCollection(HashSet::new));
assertTrue(cpSchemeId.size() == 1); assertEquals(1, cpSchemeId.size());
assertTrue(cpSchemeId.contains("eosc:contentpolicies")); assertTrue(cpSchemeId.contains("eosc:contentpolicies"));
HashSet<String> cpSchemeName = ds HashSet<String> cpSchemeName = ds
.getContentpolicies() .getContentpolicies()
.stream() .stream()
.map(Qualifier::getSchemename) .map(Qualifier::getSchemename)
.collect(Collectors.toCollection(HashSet::new)); .collect(Collectors.toCollection(HashSet::new));
assertTrue(cpSchemeName.size() == 1); assertEquals(1, cpSchemeName.size());
assertTrue(cpSchemeName.contains("eosc:contentpolicies")); assertTrue(cpSchemeName.contains("eosc:contentpolicies"));
assertEquals(2, ds.getContentpolicies().size()); assertEquals(2, ds.getContentpolicies().size());
assertEquals("Taxonomic classification", ds.getContentpolicies().get(0).getClassid()); assertEquals("Taxonomic classification", ds.getContentpolicies().get(0).getClassid());
@ -194,7 +193,7 @@ public class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
public void testProcessProject() throws Exception { void testProcessProject() throws Exception {
final List<TypedField> fields = prepareMocks("projects_resultset_entry.json"); final List<TypedField> fields = prepareMocks("projects_resultset_entry.json");
final List<Oaf> list = app.processProject(rs); final List<Oaf> list = app.processProject(rs);
@ -212,7 +211,7 @@ public class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
public void testProcessOrganization() throws Exception { void testProcessOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("organizations_resultset_entry.json"); final List<TypedField> fields = prepareMocks("organizations_resultset_entry.json");
final List<Oaf> list = app.processOrganization(rs); final List<Oaf> list = app.processOrganization(rs);
@ -239,7 +238,7 @@ public class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
public void testProcessDatasourceOrganization() throws Exception { void testProcessDatasourceOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json"); final List<TypedField> fields = prepareMocks("datasourceorganization_resultset_entry.json");
final List<Oaf> list = app.processServiceOrganization(rs); final List<Oaf> list = app.processServiceOrganization(rs);
@ -268,7 +267,7 @@ public class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
public void testProcessProjectOrganization() throws Exception { void testProcessProjectOrganization() throws Exception {
final List<TypedField> fields = prepareMocks("projectorganization_resultset_entry.json"); final List<TypedField> fields = prepareMocks("projectorganization_resultset_entry.json");
final List<Oaf> list = app.processProjectOrganization(rs); final List<Oaf> list = app.processProjectOrganization(rs);
@ -284,6 +283,38 @@ public class MigrateDbEntitiesApplicationTest {
assertEquals(r2.getSource(), r1.getTarget()); assertEquals(r2.getSource(), r1.getTarget());
assertValidId(r1.getCollectedfrom().get(0).getKey()); assertValidId(r1.getCollectedfrom().get(0).getKey());
assertValidId(r2.getCollectedfrom().get(0).getKey()); assertValidId(r2.getCollectedfrom().get(0).getKey());
assertEquals(ModelConstants.PROJECT_ORGANIZATION, r1.getRelType());
assertEquals(ModelConstants.PROJECT_ORGANIZATION, r2.getRelType());
assertEquals(ModelConstants.PARTICIPATION, r1.getSubRelType());
assertEquals(ModelConstants.PARTICIPATION, r2.getSubRelType());
if (r1.getSource().startsWith("40")) {
assertEquals(ModelConstants.HAS_PARTICIPANT, r1.getRelClass());
assertEquals(ModelConstants.IS_PARTICIPANT, r2.getRelClass());
} else if (r1.getSource().startsWith("20")) {
assertEquals(ModelConstants.IS_PARTICIPANT, r1.getRelClass());
assertEquals(ModelConstants.HAS_PARTICIPANT, r2.getRelClass());
}
assertNotNull(r1.getProperties());
checkProperty(r1, "contribution", "436754.0");
checkProperty(r2, "contribution", "436754.0");
checkProperty(r1, "currency", "EUR");
checkProperty(r2, "currency", "EUR");
}
private void checkProperty(Relation r, String property, String value) {
final List<KeyValue> p = r
.getProperties()
.stream()
.filter(kv -> kv.getKey().equals(property))
.collect(Collectors.toList());
assertFalse(p.isEmpty());
assertEquals(1, p.size());
assertEquals(value, p.get(0).getValue());
} }
@Test @Test
@ -302,7 +333,7 @@ public class MigrateDbEntitiesApplicationTest {
} }
@Test @Test
public void testProcessClaims_rels() throws Exception { void testProcessClaims_rels() throws Exception {
final List<TypedField> fields = prepareMocks("claimsrel_resultset_entry.json"); final List<TypedField> fields = prepareMocks("claimsrel_resultset_entry.json");
final List<Oaf> list = app.processClaims(rs); final List<Oaf> list = app.processClaims(rs);
@ -333,9 +364,6 @@ public class MigrateDbEntitiesApplicationTest {
assertValidId(r1.getCollectedfrom().get(0).getKey()); assertValidId(r1.getCollectedfrom().get(0).getKey());
assertValidId(r2.getCollectedfrom().get(0).getKey()); assertValidId(r2.getCollectedfrom().get(0).getKey());
// System.out.println(new ObjectMapper().writeValueAsString(r1));
// System.out.println(new ObjectMapper().writeValueAsString(r2));
} }
private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException { private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
@ -398,7 +426,7 @@ public class MigrateDbEntitiesApplicationTest {
final String[] values = ((List<?>) tf.getValue()) final String[] values = ((List<?>) tf.getValue())
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.map(o -> o.toString()) .map(Object::toString)
.toArray(String[]::new); .toArray(String[]::new);
Mockito.when(arr.getArray()).thenReturn(values); Mockito.when(arr.getArray()).thenReturn(values);

View File

@ -2,12 +2,12 @@
{ {
"field": "project", "field": "project",
"type": "string", "type": "string",
"value": "nsf_________::1700003" "value": "corda__h2020::824273"
}, },
{ {
"field": "resporganization", "field": "resporganization",
"type": "string", "type": "string",
"value": "nsf_________::University_of_Notre_Dame" "value": "corda__h2020::999945647"
}, },
{ {
"field": "participantnumber", "field": "participantnumber",
@ -16,8 +16,13 @@
}, },
{ {
"field": "contribution", "field": "contribution",
"type": "not_used", "type": "double",
"value": null "value": 436754
},
{
"field": "currency",
"type": "string",
"value": "EUR"
}, },
{ {
"field": "startdate", "field": "startdate",
@ -52,12 +57,12 @@
{ {
"field": "collectedfromid", "field": "collectedfromid",
"type": "string", "type": "string",
"value": "openaire____::nsf" "value": "openaire____::corda_h2020"
}, },
{ {
"field": "collectedfromname", "field": "collectedfromname",
"type": "string", "type": "string",
"value": "NSF - National Science Foundation" "value": "CORDA - COmmon Research DAta Warehouse - Horizon 2020"
}, },
{ {
"field": "semantics", "field": "semantics",

View File

@ -1,4 +1,4 @@
---- Sprint 1 ---- -- Sprint 1 ----
create table indi_pub_green_oa stored as parquet as create table indi_pub_green_oa stored as parquet as
select distinct p.id, coalesce(green_oa, 0) as green_oa select distinct p.id, coalesce(green_oa, 0) as green_oa
from publication p from publication p
@ -22,7 +22,8 @@ select p.id, 1 as grey_lit
from publication p from publication p
join result_classifications rt on rt.id = p.id join result_classifications rt on rt.id = p.id
where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and where rt.type not in ('Article','Part of book or chapter of book','Book','Doctoral thesis','Master thesis','Data Paper', 'Thesis', 'Bachelor thesis', 'Conference object') and
not exists (select 1 from result_classifications rc where type ='Other literature type' and rc.id=p.id)) tmp on p.id=tmp.id; not exists (select 1 from result_classifications rc where type ='Other literature type'
and rc.id=p.id)) tmp on p.id=tmp.id;
compute stats indi_pub_grey_lit; compute stats indi_pub_grey_lit;
@ -36,7 +37,8 @@ where pidtype='Digital Object Identifier' and d.name ='Crossref') tmp
on tmp.id=p.id; on tmp.id=p.id;
compute stats indi_pub_doi_from_crossref; compute stats indi_pub_doi_from_crossref;
---- Sprint 2 ----
-- Sprint 2 ----
create table indi_result_has_cc_licence stored as parquet as create table indi_result_has_cc_licence stored as parquet as
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from result r from result r
@ -82,31 +84,61 @@ on r.id= tmp.id;
compute stats indi_funded_result_with_fundref; compute stats indi_funded_result_with_fundref;
-- create table indi_result_org_country_collab stored as parquet as create table indi_result_org_collab stored as parquet as
-- with tmp as select o1.organization org1, o2.organization org2, count(distinct o1.id) as collaborations
-- (select o.id as id, o.country , ro.id as result,r.type from organization o from result_organization as o1
-- join result_organization ro on o.id=ro.organization join result_organization as o2 on o1.id=o2.id and o1.organization!=o2.organization
-- join result r on r.id=ro.id where o.country <> 'UNKNOWN') group by o1.organization, o2.organization;
-- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
-- from tmp as o1
-- join tmp as o2 on o1.result=o2.result
-- where o1.id<>o2.id and o1.country<>o2.country
-- group by o1.id, o1.type,o2.country;
--
-- compute stats indi_result_org_country_collab;
-- create table indi_result_org_collab stored as parquet as compute stats indi_result_org_collab;
-- with tmp as
-- (select o.id, ro.id as result,r.type from organization o create table indi_result_org_country_collab stored as parquet as
-- join result_organization ro on o.id=ro.organization with tmp as
-- join result r on r.id=ro.id) (select o.id as id, o.country , ro.id as result,r.type from organization o
-- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations join result_organization ro on o.id=ro.organization
-- from tmp as o1 join result r on r.id=ro.id where o.country <> 'UNKNOWN')
-- join tmp as o2 on o1.result=o2.result select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
-- where o1.id<>o2.id from tmp as o1
-- group by o1.id, o2.id, o1.type; join tmp as o2 on o1.result=o2.result
-- where o1.id<>o2.id and o1.country<>o2.country
-- compute stats indi_result_org_collab; group by o1.id, o1.type,o2.country;
compute stats indi_result_org_country_collab;
-- Counts, for every ordered pair of different organizations and per result type,
-- the distinct results that both organizations are linked to.
-- NOTE(review): this script already contains an earlier
-- "create table indi_result_org_collab" statement whose select list has no "type"
-- column; the table name is reused here, so confirm which definition is intended.
create table indi_result_org_collab stored as parquet as
with tmp as
(select o.id, ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id)
select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1
join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id
group by o1.id, o2.id, o1.type;
compute stats indi_result_org_collab;
-- For every ordered pair of different ids in organization_projects (exposed as
-- org1/org2), counts the distinct projects the two rows have in common.
create table indi_project_collab_org stored as parquet as
select a.id as org1,
       b.id as org2,
       count(distinct a.project) as collaborations
from organization_projects a
    inner join organization_projects b
        on a.project = b.project
where a.id <> b.id
group by a.id, b.id;

compute stats indi_project_collab_org;
-- For every organization, counts the distinct projects it shares with a different
-- organization from a different country, grouped by that other country.
-- Organizations whose country is 'UNKNOWN' are excluded on both sides.
create table indi_project_collab_org_country stored as parquet as
with tmp as (
    select o.id as organization,
           o.country,
           ro.project as project
    from organization o
        inner join organization_projects ro
            on o.id = ro.id
           and o.country <> 'UNKNOWN'
)
select a.organization as org1,
       b.country as country2,
       count(distinct a.project) as collaborations
from tmp a
    inner join tmp b
        on a.project = b.project
where a.organization <> b.organization
  and a.country <> b.country
group by a.organization, b.country;

compute stats indi_project_collab_org_country;
create table indi_funder_country_collab stored as parquet as create table indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op with tmp as (select funder, project, country from organization_projects op
@ -125,7 +157,7 @@ create table indi_result_country_collab stored as parquet as
with tmp as with tmp as
(select country, ro.id as result,r.type from organization o (select country, ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id) join result r on r.id=ro.id where country <> 'UNKNOWN')
select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations select o1.country country1, o2.country country2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1 from tmp as o1
join tmp as o2 on o1.result=o2.result join tmp as o2 on o1.result=o2.result
@ -182,176 +214,178 @@ where d.type like '%Journal%' and ri.accessright='Closed Access' and
on tmp.id=ri.id; on tmp.id=ri.id;
compute stats indi_pub_closed_other_open; compute stats indi_pub_closed_other_open;
---- Sprint 5 ---- ---- Sprint 5 ----
create table indi_result_no_of_copies stored as parquet as create table indi_result_no_of_copies stored as parquet as
select id, count(id) as number_of_copies from result_instance group by id; select id, count(id) as number_of_copies from result_instance group by id;
compute stats indi_result_no_of_copies; compute stats indi_result_no_of_copies;
---- Sprint 6 ---- ---- Sprint 6 ----
create table indi_pub_gold_oa stored as parquet as --create table indi_pub_gold_oa stored as parquet as
WITH gold_oa AS ( --WITH gold_oa AS (
SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn -- SELECT issn_l, journal_is_in_doaj,journal_is_oa, issn_1 as issn
FROM stats_ext.oa_journals -- FROM stats_ext.oa_journals
WHERE issn_1 != "" -- WHERE issn_1 != ""
UNION ALL -- UNION ALL
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn -- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn
FROM stats_ext.oa_journals -- FROM stats_ext.oa_journals
WHERE issn_2 != "" ), -- WHERE issn_2 != "" ),
issn AS ( --issn AS (
SELECT * FROM -- SELECT * FROM
(SELECT id, issn_printed as issn -- (SELECT id, issn_printed as issn
FROM datasource WHERE issn_printed IS NOT NULL -- FROM datasource WHERE issn_printed IS NOT NULL
UNION -- UNION
SELECT id, issn_online as issn -- SELECT id, issn_online as issn
FROM datasource WHERE issn_online IS NOT NULL) as issn -- FROM datasource WHERE issn_online IS NOT NULL) as issn
WHERE LENGTH(issn) > 7) -- WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold --SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM publication_datasources pd --FROM publication_datasources pd
LEFT OUTER JOIN ( --LEFT OUTER JOIN (
SELECT pd.id, 1 as is_gold FROM publication_datasources pd -- SELECT pd.id, 1 as is_gold FROM publication_datasources pd
JOIN issn on issn.id=pd.datasource -- JOIN issn on issn.id=pd.datasource
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; -- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
compute stats indi_pub_gold_oa; --compute stats indi_pub_gold_oa;
--
--create table indi_datasets_gold_oa stored as parquet as
--WITH gold_oa AS (
-- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_1 != ""
-- UNION
-- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn
-- FROM stats_ext.oa_journals
-- WHERE issn_2 != "" ),
--issn AS (
-- SELECT *
-- FROM (
-- SELECT id,issn_printed as issn
-- FROM datasource
-- WHERE issn_printed IS NOT NULL
-- UNION
-- SELECT id, issn_online as issn
-- FROM datasource
-- WHERE issn_online IS NOT NULL ) as issn
-- WHERE LENGTH(issn) > 7)
--SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
--FROM dataset_datasources pd
--LEFT OUTER JOIN (
-- SELECT pd.id, 1 as is_gold FROM dataset_datasources pd
-- JOIN issn on issn.id=pd.datasource
-- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
--
--compute stats indi_datasets_gold_oa;
create table indi_datasets_gold_oa stored as parquet as --create table indi_software_gold_oa stored as parquet as
WITH gold_oa AS ( --WITH gold_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn -- SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
FROM stats_ext.oa_journals -- FROM stats_ext.oa_journals
WHERE issn_1 != "" -- WHERE issn_1 != ""
UNION -- UNION
ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn -- ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn
FROM stats_ext.oa_journals -- FROM stats_ext.oa_journals
WHERE issn_2 != "" ), -- WHERE issn_2 != "" ),
issn AS ( --issn AS (
SELECT * -- SELECT *
FROM ( -- FROM (
SELECT id,issn_printed as issn -- SELECT id,issn_printed as issn
FROM datasource -- FROM datasource
WHERE issn_printed IS NOT NULL -- WHERE issn_printed IS NOT NULL
UNION -- UNION
SELECT id, issn_online as issn -- SELECT id, issn_online as issn
FROM datasource -- FROM datasource
WHERE issn_online IS NOT NULL ) as issn -- WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7) -- WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold --SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM dataset_datasources pd --FROM software_datasources pd
LEFT OUTER JOIN ( --LEFT OUTER JOIN (
SELECT pd.id, 1 as is_gold FROM dataset_datasources pd -- SELECT pd.id, 1 as is_gold FROM software_datasources pd
JOIN issn on issn.id=pd.datasource -- JOIN issn on issn.id=pd.datasource
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id; -- JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
--
compute stats indi_datasets_gold_oa; --compute stats indi_software_gold_oa;
create table indi_software_gold_oa stored as parquet as
WITH gold_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
FROM stats_ext.oa_journals
WHERE issn_1 != ""
UNION
ALL SELECT issn_l,journal_is_in_doaj,journal_is_oa,issn_2 as issn
FROM stats_ext.oa_journals
WHERE issn_2 != "" ),
issn AS (
SELECT *
FROM (
SELECT id,issn_printed as issn
FROM datasource
WHERE issn_printed IS NOT NULL
UNION
SELECT id, issn_online as issn
FROM datasource
WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7)
SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM software_datasources pd
LEFT OUTER JOIN (
SELECT pd.id, 1 as is_gold FROM software_datasources pd
JOIN issn on issn.id=pd.datasource
JOIN gold_oa on issn.issn = gold_oa.issn) tmp ON pd.id=tmp.id;
compute stats indi_software_gold_oa;
create table indi_org_findable stored as parquet as
with result_with_pid as (
select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
join result_pids rp on rp.id=ro.id
group by ro.organization),
result_has_abstract as (
select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro
join result rp on rp.id=ro.id where rp.abstract=true
group by ro.organization),
allresults as (
select organization, count(distinct id) no_allresults from result_organization
group by organization),
result_with_pid_share as (
select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share
from allresults
join result_with_pid on result_with_pid.organization=allresults.organization),
result_with_abstract_share as (
select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share
from allresults
join result_has_abstract on result_has_abstract.organization=allresults.organization)
select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable
from allresults
join result_with_pid_share on result_with_pid_share.organization=allresults.organization
left outer join (
select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization;
compute stats indi_org_findable;
create table indi_org_openess stored as parquet as
WITH datasets_oa as (
SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg
join openaire_prod_stats.result_organization ro on dg.id=ro.id
join openaire_prod_stats.dataset ds on dg.id=ds.id
WHERE dg.is_gold=1
group by ro.organization),
software_oa as (
SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg
join openaire_prod_stats.result_organization ro on dg.id=ro.id
join openaire_prod_stats.software ds on dg.id=ds.id
WHERE dg.is_gold=1
group by ro.organization),
pubs_oa as (
SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg
join openaire_prod_stats.result_organization ro on dg.id=ro.id
join openaire_prod_stats.publication ds on dg.id=ds.id
where dg.is_gold=1
group by ro.organization),
allpubs as (
SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
join openaire_prod_stats.publication ps on ps.id=ro.id
group by ro.organization),
alldatasets as (
SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
join openaire_prod_stats.dataset ps on ps.id=ro.id
group by ro.organization),
allsoftware as (
SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
join openaire_prod_stats.software ps on ps.id=ro.id
group by ro.organization),
allpubsshare as (
select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
join pubs_oa on allpubs.organization=pubs_oa.organization),
alldatasetssshare as (
select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c
from alldatasets
join datasets_oa on alldatasets.organization=datasets_oa.organization),
allsoftwaresshare as (
select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
from allsoftware
join software_oa on allsoftware.organization=software_oa.organization)
select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess
FROM allpubsshare
left outer join (
select organization,c from
alldatasetssshare) tmp on tmp.organization=allpubsshare.organization
left outer join (
select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization;
compute stats indi_org_openess;
--create table indi_org_findable stored as parquet as
--with result_with_pid as (
-- select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro
-- join result_pids rp on rp.id=ro.id
-- group by ro.organization),
--result_has_abstract as (
-- select ro.organization organization, count(distinct rp.id) no_result_with_abstract from result_organization ro
-- join result rp on rp.id=ro.id where rp.abstract=true
-- group by ro.organization),
--allresults as (
-- select organization, count(distinct id) no_allresults from result_organization
-- group by organization),
--result_with_pid_share as (
-- select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults pid_share
-- from allresults
-- join result_with_pid on result_with_pid.organization=allresults.organization),
--result_with_abstract_share as (
-- select allresults.organization, result_has_abstract.no_result_with_abstract/allresults.no_allresults abstract_share
-- from allresults
-- join result_has_abstract on result_has_abstract.organization=allresults.organization)
--select allresults.organization, coalesce((pid_share+abstract_share)/2,pid_share) org_findable
--from allresults
--join result_with_pid_share on result_with_pid_share.organization=allresults.organization
--left outer join (
-- select organization, abstract_share from result_with_abstract_share) tmp on tmp.organization=allresults.organization;
--
--compute stats indi_org_findable;
--
--create table indi_org_openess stored as parquet as
--WITH datasets_oa as (
-- SELECT ro.organization, count(dg.id) no_oadatasets FROM indi_datasets_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join dataset ds on dg.id=ds.id
-- WHERE dg.is_gold=1
-- group by ro.organization),
--software_oa as (
-- SELECT ro.organization, count(dg.id) no_oasoftware FROM indi_software_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join software ds on dg.id=ds.id
-- WHERE dg.is_gold=1
-- group by ro.organization),
--pubs_oa as (
-- SELECT ro.organization, count(dg.id) no_oapubs FROM indi_pub_gold_oa dg
-- join result_organization ro on dg.id=ro.id
-- join publication ds on dg.id=ds.id
-- where dg.is_gold=1
-- group by ro.organization),
--allpubs as (
-- SELECT ro.organization organization, count(ro.id) no_allpubs FROM result_organization ro
-- join publication ps on ps.id=ro.id
-- group by ro.organization),
--alldatasets as (
-- SELECT ro.organization organization, count(ro.id) no_alldatasets FROM result_organization ro
-- join dataset ps on ps.id=ro.id
-- group by ro.organization),
--allsoftware as (
-- SELECT ro.organization organization, count(ro.id) no_allsoftware FROM result_organization ro
-- join software ps on ps.id=ro.id
-- group by ro.organization),
--allpubsshare as (
-- select pubs_oa.organization, pubs_oa.no_oapubs/allpubs.no_allpubs p from allpubs
-- join pubs_oa on allpubs.organization=pubs_oa.organization),
--alldatasetssshare as (
-- select datasets_oa.organization, datasets_oa.no_oadatasets/alldatasets.no_alldatasets c
-- from alldatasets
-- join datasets_oa on alldatasets.organization=datasets_oa.organization),
--allsoftwaresshare as (
-- select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsoftware s
-- from allsoftware
-- join software_oa on allsoftware.organization=software_oa.organization)
--select allpubsshare.organization, coalesce((c+p+s)/3, p) org_openess
--FROM allpubsshare
--left outer join (
-- select organization,c from
-- alldatasetssshare) tmp on tmp.organization=allpubsshare.organization
--left outer join (
-- select organization,s from allsoftwaresshare) tmp1 on tmp1.organization=allpubsshare.organization;
--
--compute stats indi_org_openess;
--
create table indi_pub_hybrid_oa_with_cc stored as parquet as create table indi_pub_hybrid_oa_with_cc stored as parquet as
WITH hybrid_oa AS ( WITH hybrid_oa AS (
SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn SELECT issn_l, journal_is_in_doaj, journal_is_oa, issn_print as issn
@ -418,3 +452,394 @@ GROUP BY result_id, repository_id, `year`
order by `year` asc, result_id; order by `year` asc, result_id;
compute stats indi_pub_downloads_datasource_year; compute stats indi_pub_downloads_datasource_year;
---- Sprint 7 ----
-- indi_pub_gold_oa: one row per publication id, with is_gold = 1 when at least one
-- of its datasources carries an ISSN listed in stats_ext.oa_journals, and 0 otherwise.
create table indi_pub_gold_oa stored as parquet as
with
    -- every non-empty ISSN (issn_1 or issn_2) of the journals in stats_ext.oa_journals
    gold_oa as (
        select issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn
        from stats_ext.oa_journals
        where issn_1 != ""
        union all
        select issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn
        from stats_ext.oa_journals
        where issn_2 != ""
    ),
    -- printed and online ISSNs of each datasource, keeping values longer than 7 characters
    issn as (
        select *
        from (
            select id, issn_printed as issn
            from datasource
            where issn_printed is not null
            union
            select id, issn_online as issn
            from datasource
            where issn_online is not null or id like '%doajarticles%'
        ) as datasource_issn
        where length(issn) > 7
    )
select distinct pd.id, coalesce(gold.is_gold, 0) as is_gold
from publication_datasources pd
left outer join (
    -- publications reachable through a datasource ISSN that matches a gold_oa ISSN
    select pdi.id, 1 as is_gold
    from publication_datasources pdi
    join issn on issn.id = pdi.datasource
    join gold_oa on issn.issn = gold_oa.issn
) gold on pd.id = gold.id;
compute stats indi_pub_gold_oa;
-- indi_pub_hybrid: one row per publication id, with is_hybrid = 1 when at least one
-- of its datasources carries an ISSN of a stats_ext.oa_journals entry flagged as
-- not in DOAJ or not open access, and 0 otherwise.
create table indi_pub_hybrid stored as parquet as
with
    -- every non-empty ISSN (issn_1 or issn_2) of the journals in stats_ext.oa_journals
    -- NOTE(review): has_apc is selected here but not used by the final query
    gold_oa as (
        select issn_l, journal_is_in_doaj, journal_is_oa, issn_1 as issn, has_apc
        from stats_ext.oa_journals
        where issn_1 != ""
        union all
        select issn_l, journal_is_in_doaj, journal_is_oa, issn_2 as issn, has_apc
        from stats_ext.oa_journals
        where issn_2 != ""
    ),
    -- printed and online ISSNs of each datasource, keeping values longer than 7 characters
    issn as (
        select *
        from (
            select id, issn_printed as issn
            from datasource
            where issn_printed is not null
            union
            select id, issn_online as issn
            from datasource
            where issn_online is not null or id like '%doajarticles%'
        ) as datasource_issn
        where length(issn) > 7
    )
select distinct pd.id, coalesce(hybrid.is_hybrid, 0) as is_hybrid
from publication_datasources pd
left outer join (
    -- No separate join on datasource is needed: every issn.id already originates
    -- from the datasource table, and duplicates are removed by the outer distinct.
    select pdi.id, 1 as is_hybrid
    from publication_datasources pdi
    join issn on issn.id = pdi.datasource
    join gold_oa on issn.issn = gold_oa.issn
    where (gold_oa.journal_is_in_doaj = false or gold_oa.journal_is_oa = false)
) hybrid on pd.id = hybrid.id;
compute stats indi_pub_hybrid;
-- indi_org_fairness: per organization, the ratio between its results with rich
-- metadata (title, publisher, abstract flag, year, authors > 0) and all of its
-- results, considering only results with year > 2003.
-- Organizations without any rich-metadata result are not part of the output.
-- NOTE(review): the join on result_pids is disabled in this query, so a PID is not required.
create table indi_org_fairness stored as parquet as
with
    -- results with rich metadata, counted per organization
    rich_results as (
        select ro.organization as organization, count(distinct ro.id) as no_result_fair
        from result_organization ro
        join result r on r.id = ro.id
        where (r.title is not null)
          and (r.publisher is not null)
          and (r.abstract is true)
          and (r.year is not null)
          and (r.authors > 0)
          and r.year > 2003
        group by ro.organization
    ),
    -- all results, counted per organization
    all_results as (
        select ro.organization as organization, count(distinct ro.id) as no_allresults
        from result_organization ro
        join result r on r.id = ro.id
        where r.year > 2003
        group by ro.organization
    )
-- rich results divided by all results
select all_results.organization,
       rich_results.no_result_fair / all_results.no_allresults as org_fairness
from all_results
join rich_results on rich_results.organization = all_results.organization;
compute stats indi_org_fairness;
-- indi_org_fairness_pub_pr: per organization, the ratio between its publications
-- with rich metadata (title, publisher, abstract flag, year, authors > 0) that also
-- have doi_from_crossref = 1 and grey_lit = 0, and all of its publications.
-- Only publications with year > 2003 are considered on both sides of the ratio.
create table indi_org_fairness_pub_pr stored as parquet as
with
    -- qualifying publications, counted per organization
    rich_pubs as (
        select ro.organization as organization, count(distinct ro.id) as no_result_fair
        from result_organization ro
        join publication p on p.id = ro.id
        join indi_pub_doi_from_crossref dc on dc.id = p.id
        join indi_pub_grey_lit gl on gl.id = p.id
        where (p.title is not null)
          and (p.publisher is not null)
          and (p.abstract is true)
          and (p.year is not null)
          and (p.authors > 0)
          and cast(p.year as int) > 2003
          and dc.doi_from_crossref = 1
          and gl.grey_lit = 0
        group by ro.organization
    ),
    -- all publications, counted per organization
    all_pubs as (
        select ro.organization as organization, count(distinct ro.id) as no_allresults
        from result_organization ro
        join publication p on p.id = ro.id
        where cast(p.year as int) > 2003
        group by ro.organization
    )
-- qualifying publications divided by all publications
select all_pubs.organization,
       rich_pubs.no_result_fair / all_pubs.no_allresults as org_fairness
from all_pubs
join rich_pubs on rich_pubs.organization = all_pubs.organization;
compute stats indi_org_fairness_pub_pr;
-- indi_org_fairness_pub_year: per organization and year, the ratio between its
-- publications with rich metadata (title, publisher, abstract flag, year, authors > 0)
-- and all of its publications, considering only publications with year > 2003.
create table indi_org_fairness_pub_year stored as parquet as
with
    -- publications with rich metadata, counted per organization and year
    rich_pubs as (
        select p.year, ro.organization as organization, count(distinct ro.id) as no_result_fair
        from result_organization ro
        join publication p on p.id = ro.id
        where (p.title is not null)
          and (p.publisher is not null)
          and (p.abstract is true)
          and (p.year is not null)
          and (p.authors > 0)
          and cast(p.year as int) > 2003
        group by ro.organization, p.year
    ),
    -- all publications, counted per organization and year
    all_pubs as (
        select p.year, ro.organization as organization, count(distinct ro.id) as no_allresults
        from result_organization ro
        join publication p on p.id = ro.id
        where cast(p.year as int) > 2003
        group by ro.organization, p.year
    )
select all_pubs.year,
       all_pubs.organization,
       rich_pubs.no_result_fair / all_pubs.no_allresults as org_fairness
from all_pubs
join rich_pubs
  on rich_pubs.organization = all_pubs.organization
 and rich_pubs.year = all_pubs.year;
compute stats indi_org_fairness_pub_year;
-- indi_org_fairness_pub: per organization, the ratio between its publications with
-- rich metadata (title, publisher, abstract flag, year, authors > 0) and all of its
-- publications, considering only publications with year > 2003.
-- Stored as parquet like the other indicator tables created by this script.
create table indi_org_fairness_pub stored as parquet as
with
    -- publications with rich metadata, counted per organization
    result_fair as (
        select ro.organization organization, count(distinct ro.id) no_result_fair
        from result_organization ro
        join publication p on p.id = ro.id
        where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null)
          and (authors > 0) and cast(year as int) > 2003
        group by ro.organization
    ),
    -- all publications, counted per organization
    allresults as (
        select organization, count(distinct ro.id) no_allresults
        from result_organization ro
        join publication p on p.id = ro.id
        where cast(year as int) > 2003
        group by organization
    )
-- rich publications divided by all publications
select allresults.organization, result_fair.no_result_fair / allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization = allresults.organization;
compute stats indi_org_fairness_pub;
-- indi_org_fairness_year: per organization and year, the ratio between its results
-- that have an entry in result_pids plus rich metadata (title, publisher, abstract
-- flag, year, authors > 0) and all of its results, considering only year > 2003.
create table indi_org_fairness_year stored as parquet as
with
    -- results with a PID entry and rich metadata, counted per organization and year
    rich_results as (
        select r.year, ro.organization as organization, count(distinct ro.id) as no_result_fair
        from result_organization ro
        join result r on r.id = ro.id
        join result_pids rp on r.id = rp.id
        where (r.title is not null)
          and (r.publisher is not null)
          and (r.abstract is true)
          and (r.year is not null)
          and (r.authors > 0)
          and r.year > 2003
        group by ro.organization, r.year
    ),
    -- all results, counted per organization and year
    all_results as (
        select r.year, ro.organization as organization, count(distinct ro.id) as no_allresults
        from result_organization ro
        join result r on r.id = ro.id
        where r.year > 2003
        group by ro.organization, r.year
    )
-- rich results divided by all results
select all_results.year,
       all_results.organization,
       rich_results.no_result_fair / all_results.no_allresults as org_fairness
from all_results
join rich_results
  on rich_results.organization = all_results.organization
 and rich_results.year = all_results.year;
compute stats indi_org_fairness_year;
-- indi_org_findable_year: per organization and year, the ratio between its results
-- that have an entry in result_pids and all of its results (year > 2003 only).
create table indi_org_findable_year stored as parquet as
with
    -- results with a PID entry, counted per organization and year
    pid_results as (
        select r.year, ro.organization as organization, count(distinct rp.id) as no_result_with_pid
        from result_organization ro
        join result_pids rp on rp.id = ro.id
        join result r on r.id = rp.id
        where r.year > 2003
        group by ro.organization, r.year
    ),
    -- all results, counted per organization and year
    all_results as (
        select r.year, ro.organization as organization, count(distinct ro.id) as no_allresults
        from result_organization ro
        join result r on r.id = ro.id
        where r.year > 2003
        group by ro.organization, r.year
    )
-- results with a PID divided by all results
select all_results.year,
       all_results.organization,
       pid_results.no_result_with_pid / all_results.no_allresults as org_findable
from all_results
join pid_results
  on pid_results.organization = all_results.organization
 and pid_results.year = all_results.year;
compute stats indi_org_findable_year;
-- indi_org_findable: per organization, the ratio between its results that have an
-- entry in result_pids and all of its results (year > 2003 only).
create table indi_org_findable stored as parquet as
with
    -- results with a PID entry, counted per organization
    pid_results as (
        select ro.organization as organization, count(distinct rp.id) as no_result_with_pid
        from result_organization ro
        join result_pids rp on rp.id = ro.id
        join result r on r.id = rp.id
        where r.year > 2003
        group by ro.organization
    ),
    -- all results, counted per organization
    all_results as (
        select ro.organization as organization, count(distinct ro.id) as no_allresults
        from result_organization ro
        join result r on r.id = ro.id
        where r.year > 2003
        group by ro.organization
    )
-- results with a PID divided by all results
select all_results.organization,
       pid_results.no_result_with_pid / all_results.no_allresults as org_findable
from all_results
join pid_results on pid_results.organization = all_results.organization;
compute stats indi_org_findable;
-- indi_org_openess: per organization, the average of its open shares.
-- A share is (items having an instance with access right Open Access, Embargo or
-- Open Source) divided by (all items), computed separately for publications,
-- datasets and software with year > 2003.
-- The publication share is always part of the average. The dataset and software
-- shares are included only when the organization has one, and the divisor is
-- reduced accordingly. Organizations without a publication share are not output.
create table indi_org_openess stored as parquet as
with
    -- open items per organization
    oa_pubs as (
        select ro.organization, count(distinct pub.id) as no_oapubs
        from publication pub
        join result_organization ro on ro.id = pub.id
        join result_instance ri on ri.id = pub.id
        where ri.accessright in ('Open Access', 'Embargo', 'Open Source')
          and cast(pub.year as int) > 2003
        group by ro.organization
    ),
    oa_datasets as (
        select ro.organization, count(distinct dat.id) as no_oadatasets
        from dataset dat
        join result_organization ro on ro.id = dat.id
        join result_instance ri on ri.id = dat.id
        where ri.accessright in ('Open Access', 'Embargo', 'Open Source')
          and cast(dat.year as int) > 2003
        group by ro.organization
    ),
    oa_software as (
        select ro.organization, count(distinct sw.id) as no_oasoftware
        from software sw
        join result_organization ro on ro.id = sw.id
        join result_instance ri on ri.id = sw.id
        where ri.accessright in ('Open Access', 'Embargo', 'Open Source')
          and cast(sw.year as int) > 2003
        group by ro.organization
    ),
    -- all items per organization
    all_pubs as (
        select ro.organization as organization, count(ro.id) as no_allpubs
        from result_organization ro
        join publication pub on pub.id = ro.id
        where cast(pub.year as int) > 2003
        group by ro.organization
    ),
    all_datasets as (
        select ro.organization as organization, count(ro.id) as no_alldatasets
        from result_organization ro
        join dataset dat on dat.id = ro.id
        where cast(dat.year as int) > 2003
        group by ro.organization
    ),
    all_software as (
        select ro.organization as organization, count(ro.id) as no_allsoftware
        from result_organization ro
        join software sw on sw.id = ro.id
        where cast(sw.year as int) > 2003
        group by ro.organization
    ),
    -- open share per organization and result type
    pubs_share as (
        select oa_pubs.organization, oa_pubs.no_oapubs / all_pubs.no_allpubs as p
        from all_pubs
        join oa_pubs on all_pubs.organization = oa_pubs.organization
    ),
    datasets_share as (
        select oa_datasets.organization, oa_datasets.no_oadatasets / all_datasets.no_alldatasets as d
        from all_datasets
        join oa_datasets on all_datasets.organization = oa_datasets.organization
    ),
    software_share as (
        select oa_software.organization, oa_software.no_oasoftware / all_software.no_allsoftware as s
        from all_software
        join oa_software on all_software.organization = oa_software.organization
    )
select pubs_share.organization,
       (pubs_share.p + isnull(software_share.s, 0) + isnull(datasets_share.d, 0))
           / (1 + (case when software_share.s is null then 0 else 1 end)
                + (case when datasets_share.d is null then 0 else 1 end)) as org_openess
from pubs_share
left outer join datasets_share on datasets_share.organization = pubs_share.organization
left outer join software_share on software_share.organization = pubs_share.organization;
compute stats indi_org_openess;
-- indi_org_openess_year: per organization and year, the average of its open shares.
-- A share is (items having an instance with access right Open Access, Embargo or
-- Open Source) divided by (all items), computed separately for publications,
-- datasets and software with year > 2003.
-- The publication share is always part of the average. The dataset and software
-- shares are included only when one exists for the same organization and year.
create table indi_org_openess_year stored as parquet as
with
    -- open items per organization and year
    oa_pubs as (
        select pub.year, ro.organization, count(distinct pub.id) as no_oapubs
        from publication pub
        join result_organization ro on ro.id = pub.id
        join result_instance ri on ri.id = pub.id
        where ri.accessright in ('Open Access', 'Embargo', 'Open Source')
          and cast(pub.year as int) > 2003
        group by ro.organization, pub.year
    ),
    oa_datasets as (
        select dat.year, ro.organization, count(distinct dat.id) as no_oadatasets
        from dataset dat
        join result_organization ro on ro.id = dat.id
        join result_instance ri on ri.id = dat.id
        where ri.accessright in ('Open Access', 'Embargo', 'Open Source')
          and cast(dat.year as int) > 2003
        group by ro.organization, dat.year
    ),
    oa_software as (
        select sw.year, ro.organization, count(distinct sw.id) as no_oasoftware
        from software sw
        join result_organization ro on ro.id = sw.id
        join result_instance ri on ri.id = sw.id
        where ri.accessright in ('Open Access', 'Embargo', 'Open Source')
          and cast(sw.year as int) > 2003
        group by ro.organization, sw.year
    ),
    -- all items per organization and year
    all_pubs as (
        select pub.year, ro.organization as organization, count(ro.id) as no_allpubs
        from result_organization ro
        join publication pub on pub.id = ro.id
        where cast(pub.year as int) > 2003
        group by ro.organization, pub.year
    ),
    all_datasets as (
        select dat.year, ro.organization as organization, count(ro.id) as no_alldatasets
        from result_organization ro
        join dataset dat on dat.id = ro.id
        where cast(dat.year as int) > 2003
        group by ro.organization, dat.year
    ),
    all_software as (
        select sw.year, ro.organization as organization, count(ro.id) as no_allsoftware
        from result_organization ro
        join software sw on sw.id = ro.id
        where cast(sw.year as int) > 2003
        group by ro.organization, sw.year
    ),
    -- open share per organization, year and result type
    pubs_share as (
        select all_pubs.year, oa_pubs.organization, oa_pubs.no_oapubs / all_pubs.no_allpubs as p
        from all_pubs
        join oa_pubs
          on all_pubs.organization = oa_pubs.organization
         and cast(all_pubs.year as int) = cast(oa_pubs.year as int)
    ),
    datasets_share as (
        select all_datasets.year, oa_datasets.organization,
               oa_datasets.no_oadatasets / all_datasets.no_alldatasets as d
        from all_datasets
        join oa_datasets
          on all_datasets.organization = oa_datasets.organization
         and cast(all_datasets.year as int) = cast(oa_datasets.year as int)
    ),
    software_share as (
        select all_software.year, oa_software.organization,
               oa_software.no_oasoftware / all_software.no_allsoftware as s
        from all_software
        join oa_software
          on all_software.organization = oa_software.organization
         and cast(all_software.year as int) = cast(oa_software.year as int)
    )
select pubs_share.year,
       pubs_share.organization,
       (pubs_share.p + isnull(software_share.s, 0) + isnull(datasets_share.d, 0))
           / (1 + (case when software_share.s is null then 0 else 1 end)
                + (case when datasets_share.d is null then 0 else 1 end)) as org_openess
from pubs_share
left outer join datasets_share
  on datasets_share.organization = pubs_share.organization
 and datasets_share.year = pubs_share.year
left outer join software_share
  on software_share.organization = pubs_share.organization
 and software_share.year = pubs_share.year;
compute stats indi_org_openess_year;
-- indi_pub_has_preprint: one row per id found in publication_classifications, with
-- has_preprint = 1 when the id has a classification of type Preprint, and 0 otherwise.
create table indi_pub_has_preprint stored as parquet as
select distinct pc.id, coalesce(pre.has_preprint, 0) as has_preprint
from publication_classifications pc
left outer join (
    select cls.id, 1 as has_preprint
    from publication_classifications cls
    where cls.type = 'Preprint'
) pre on pc.id = pre.id;
compute stats indi_pub_has_preprint;
-- indi_pub_in_subscribed: one row per publication id, with is_subscription = 1 when
-- the publication is flagged as neither gold, hybrid nor transformative by the
-- corresponding indicator tables, and 0 otherwise.
create table indi_pub_in_subscribed stored as parquet as
select distinct pub.id, coalesce(sub.is_subscription, 0) as is_subscription
from publication pub
left outer join (
    select p.id, 1 as is_subscription
    from publication p
    join indi_pub_gold_oa g on p.id = g.id
    join indi_pub_hybrid h on p.id = h.id
    join indi_pub_in_transformative t on p.id = t.id
    where g.is_gold = 0
      and h.is_hybrid = 0
      and t.is_transformative = 0
) sub on pub.id = sub.id;
compute stats indi_pub_in_subscribed;
-- indi_result_with_pid: one row per result id, with result_with_pid = 1 when the
-- result has at least one entry in result_pids, and 0 otherwise.
-- Stored as parquet like the other indicator tables created by this script.
create table indi_result_with_pid stored as parquet as
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
from result p
left outer join (
    select p.id, 1 as result_with_pid
    from result_pids p
) tmp on p.id = tmp.id;
compute stats indi_result_with_pid;

View File

@ -38,7 +38,14 @@ create table TARGET.result stored as parquet as
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
'openorgs____::6445d7758d3a40c4d997953b6632a368' --National Institute of Informatics (NII) 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- University of Warsaw
'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
'openorgs____::4ac562f0376fce3539504567649cb373' -- University of Patras
) )) foo; ) )) foo;
compute stats TARGET.result; compute stats TARGET.result;
@ -107,6 +114,9 @@ compute stats TARGET.result_sources;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_topics; compute stats TARGET.result_topics;
-- Copy of SOURCE.result_fos limited to the rows whose id exists in TARGET.result.
create table TARGET.result_fos stored as parquet as
select *
from SOURCE.result_fos orig
where exists (
    select 1
    from TARGET.result r
    where r.id = orig.id
);
compute stats TARGET.result_fos;
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;

View File

@ -11,13 +11,13 @@ where p.datainfo.deletedbyinference=false and p.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_organizations STORED AS PARQUET AS
SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization
from ${openaire_db_name}.relation r from ${openaire_db_name}.relation r
WHERE r.reltype = 'projectOrganization' WHERE r.reltype = 'projectOrganization' and r.source like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.project_results STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS result, r.datainfo.provenanceaction.classname as provenance
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultProject' WHERE r.reltype = 'resultProject' and r.target like '40|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
create table ${stats_db_name}.project_classification STORED AS PARQUET as create table ${stats_db_name}.project_classification STORED AS PARQUET as

View File

@ -123,6 +123,16 @@ UNION ALL
SELECT * SELECT *
FROM ${stats_db_name}.otherresearchproduct_topics; FROM ${stats_db_name}.otherresearchproduct_topics;
-- result_fos: Fields of Science and Technology topics of each result, arranged as
-- (lvl1, lvl2, lvl3) rows. A topic belongs to level 1, 2 or 3 when it starts with
-- exactly 2, 4 or 6 characters followed by a space. Levels are linked when the
-- leading characters of the deeper topic match the shallower one.
-- Only results having all three levels produce a row (inner joins).
create table ${stats_db_name}.result_fos stored as parquet as
with
    lvl1 as (
        select id, topic
        from ${stats_db_name}.result_topics
        where topic like '__ %'
          and type = 'Fields of Science and Technology classification'
    ),
    lvl2 as (
        select id, topic
        from ${stats_db_name}.result_topics
        where topic like '____ %'
          and type = 'Fields of Science and Technology classification'
    ),
    lvl3 as (
        select id, topic
        from ${stats_db_name}.result_topics
        where topic like '______ %'
          and type = 'Fields of Science and Technology classification'
    )
select lvl1.id,
       lvl1.topic as lvl1,
       lvl2.topic as lvl2,
       lvl3.topic as lvl3
from lvl1
join lvl2
  on lvl1.id = lvl2.id
 and substr(lvl2.topic, 1, 2) = substr(lvl1.topic, 1, 2)
join lvl3
  on lvl3.id = lvl1.id
 and substr(lvl3.topic, 1, 4) = substr(lvl2.topic, 1, 4);
CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
@ -135,3 +145,4 @@ select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS da
FROM ${stats_db_name}.result r FROM ${stats_db_name}.result r
JOIN ${stats_db_name}.project_results pr ON r.id = pr.result JOIN ${stats_db_name}.project_results pr ON r.id = pr.result
JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id; JOIN ${stats_db_name}.project_tmp p ON p.id = pr.id;

View File

@ -801,7 +801,7 @@
<mockito-core.version>3.3.3</mockito-core.version> <mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version> <mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version> <vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.12.0]</dhp-schemas.version> <dhp-schemas.version>[2.12.2-SNAPSHOT]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version> <dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version> <dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version> <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>