Compare commits

...

69 Commits

Author SHA1 Message Date
Giambattista Bloisi 56224e034a Fill the new mergedIds field when generating dedup records
Filter out dedup records composed of invisible records only
Filter out mergerels that have not been used when creating the dedup record (ungrouping of cliques)
2024-10-28 13:31:01 +01:00
Claudio Atzori 46dbb62598 Merge pull request '#9839: include claimed affiliation relationships' (#476) from claim-orgs into beta
Reviewed-on: #476
2024-10-25 10:12:59 +02:00
Claudio Atzori d3764265d5 Merge pull request '[dedup] avoid NPEs in the countryInference dedup utility' (#475) from dedup_countryInference_NPE into beta
Reviewed-on: #475
2024-10-25 10:12:06 +02:00
Claudio Atzori 4a9aeb6238 Merge pull request '9126-impact-indicators-wf-optimisation' (#471) from 9126-impact-indicators-wf-optimisation into beta
Reviewed-on: #471
2024-10-25 10:10:44 +02:00
Claudio Atzori 8172bee8c8 Merge pull request 'Minor fixes' (#496) from beta_fixes_oct into beta
Reviewed-on: #496
2024-10-25 10:09:56 +02:00
Miriam Baglioni e75326d6ec [FundersMatchFromCrossref] added match from CrossRef to DFG unidentified project 2024-10-25 09:13:54 +02:00
Giambattista Bloisi 6bc741715c Fix OafMapperUtilsTest.testMergePubs 2024-10-23 14:02:45 +02:00
Giambattista Bloisi aa7b8fd014 Use workingDir parameter for temporary data of ORCID enrichment 2024-10-23 14:02:17 +02:00
Giambattista Bloisi 0e34b0ece1 Fix imports: point them from the main distribution packages 2024-10-23 14:01:52 +02:00
Giambattista Bloisi 56b05cde0b Revert the changes for IgnoreUndefined management in tree evaluation 2024-10-11 10:35:15 +02:00
Claudio Atzori 62ff843334 adopting dhp-schemas:8.0.1 to support Author's rawAffiliationString(s). Improved graph2hive implementation 2024-10-08 16:22:54 +02:00
Claudio Atzori d5867a1992 merged #490 2024-10-08 15:39:59 +02:00
Claudio Atzori e5df68772d [graph provision] fixed serialisation of the usage counts as measures in the XML records 2024-10-02 09:35:21 +02:00
Miriam Baglioni 7e6d12fa77 [UsageCount] fixed error
(cherry picked from commit 9c9a9562ae)
2024-10-01 15:55:07 +02:00
Miriam Baglioni 191fc3a461 [UsageCount] add check in case the datasource is not matched against those present in the graph
(cherry picked from commit b42bdd5fb3)
2024-10-01 15:54:31 +02:00
Claudio Atzori 10696f2a44 reverted procedure for creating the UsageCounts actionset 2024-10-01 15:54:13 +02:00
Claudio Atzori 5734b80861 Merge pull request 'datasource table creation split in steps' (#489) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: #489
2024-09-30 16:34:38 +02:00
Antonis Lempesis f3c179658a datasource table creation split in steps 2024-09-30 17:12:21 +03:00
Miriam Baglioni b18ad035c1 Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta 2024-09-30 15:10:44 +02:00
Miriam Baglioni e430826e00 [ImportOC] fix to move original folder instead of extracted ones 2024-09-30 15:10:10 +02:00
Giambattista Bloisi c45cae447a Fix: invert the "natural" order when ordering by id lexicographically 2024-09-26 17:08:02 +02:00
Claudio Atzori 3fcafc7ed6 Merge pull request 'Latest institutions in monitor dbs' (#472) from antonis.lempesis/dnet-hadoop:beta into beta
Reviewed-on: #472
2024-09-26 09:49:01 +02:00
Miriam Baglioni 599e56dbc6 Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta 2024-09-25 17:28:23 +02:00
Claudio Atzori 6397141e56 code formatting 2024-09-25 15:27:32 +02:00
Claudio Atzori e354f9853a [OpenCitations] move the extracted contents under a backup path to avoid needing to re-download it in case of errors 2024-09-25 15:27:02 +02:00
Claudio Atzori 535a7b99f1 the metadata collection plugins using the HttpConnector2 class shall now retry instead of failing in case of UnknownHostException 2024-09-25 11:35:34 +02:00
Sandro La Bruzzo 6a097abc89 as described on ticket #9525
1. Changed the mapping applied to Crossref records: anything that has a relationship "is-review-of" must be mapped as publication of type "Review".
2. Force the hostedby of Crossref records with DOI prefix 10.3410 and 10.12703 to the H1 Connect data source.
2024-09-25 11:32:54 +02:00
Michele Artini 9754521847 Merge pull request 'fixed a bug with id' (#486) from osfPreprints_plugin into beta
Reviewed-on: #486
2024-09-25 10:02:24 +02:00
Michele Artini 54f8b4da39 Merge pull request 'fixed a bug with 'null' string' (#484) from osfPreprints_plugin into beta
Reviewed-on: #484
2024-09-24 15:19:54 +02:00
Miriam Baglioni 4d3e079590 Merge remote-tracking branch 'origin/beta' into beta 2024-09-24 14:26:29 +02:00
Michele Artini e941adbe2b fixed a bug with topic ENRICH/MORE/SUBJECT/ARXIV 2024-09-24 08:57:37 +02:00
Michele Artini fdbe629f49 removed the deletedByInference=true filter 2024-09-23 15:27:28 +02:00
Antonis Lempesis 619aa34a15 Merge branch 'beta' of https://code-repo.d4science.org/antonis.lempesis/dnet-hadoop into beta 2024-09-23 15:25:59 +03:00
Antonis Lempesis dbea7a4072 removed duplicate line 2024-09-23 14:57:11 +03:00
Antonis Lempesis c9241dba0d Merge pull request 'convert_hive_to_spark_actions' (#1) from convert_hive_to_spark_actions into beta
Reviewed-on: antonis.lempesis/dnet-hadoop#1
2024-09-23 13:53:28 +02:00
Michele Artini 755a5aefcf Merge pull request 'osfPreprints_plugin' (#482) from osfPreprints_plugin into beta
Reviewed-on: #482
2024-09-23 10:21:34 +02:00
Michele Artini db6f137cf9 Merge pull request 'osfPreprints_plugin' (#480) from osfPreprints_plugin into beta
Reviewed-on: #480
2024-09-20 09:56:50 +02:00
Alessia 07e6e7b4d6 #9839: include claimed affiliation relationships 2024-09-16 13:41:56 +02:00
Antonis Lempesis 37ad259296 cleanup 2024-09-05 16:02:44 +03:00
Antonis Lempesis b64c144abf added new institutions 2024-09-05 16:00:09 +03:00
Serafeim Chatzopoulos b043f8a963 Remove redundant error messages from impact indicators workflow 2024-09-04 14:28:43 +03:00
Serafeim Chatzopoulos db03f85366 Remove steps for updating BIP! from the impact indicators workflow 2024-09-04 14:25:44 +03:00
Miriam Baglioni 468f2aa5a5 [AffiliationAffRo]align beta with new affiliation from publisher webpage introduced in production. AffRo collectedfrom OpenAIRE to discriminate against WebCrawl 2024-08-12 18:10:46 +02:00
Miriam Baglioni 89fcf4086c [Person]fix issue in affiliation relation id construction for person (missing ::) 2024-08-12 18:04:43 +02:00
Miriam Baglioni 8c185a7b1a resolving conflicts 2024-08-05 17:14:11 +02:00
Miriam Baglioni 985ca15264 [openaire-affiliation]removes matchings without DOI 2024-08-05 12:10:40 +02:00
Claudio Atzori 75a11d0ba5 [dedup] avoid NPEs in the countryInference dedup utility 2024-07-25 16:34:32 +02:00
Antonis Lempesis d0590e0e49 added latest institutions 2024-07-23 15:17:15 +03:00
Antonis Lempesis 7d2c0a3723 added new institutions 2024-07-23 15:10:17 +03:00
Lampros Smyrnaios e9686365a2 Improve performance of creating the "result_fos" table, by using a temp-table to cache data, which is requested multiple times. 2024-07-03 20:24:36 +03:00
Lampros Smyrnaios ce0aee21cc Improve performance of transferring the stats-DBs to another cluster and querying the DBs' tables, by ordering Spark to create up to 100 files per table, instead of thousands. 2024-07-03 20:15:33 +03:00
Lampros Smyrnaios 7b7dd32ad5 - Fix placement of some "set mapred.job.queue.name=analytics" statements and remove their unused "/*EOS*/" indicator.
- Add stacktrace-info to failed actions.
2024-07-03 19:53:24 +03:00
Lampros Smyrnaios 7ce051d766 - Update the remaining hive-actions to spark-actions.
- Update the version of shell-actions.
- Fix missing "/*EOS*/" indicators.
2024-07-03 19:49:19 +03:00
Lampros Smyrnaios aa4d7d5e20 Prioritize the rest of the stats-queries over other tasks on the cluster, by putting them in the "analytics" queue. 2024-07-03 19:14:25 +03:00
Lampros Smyrnaios 54e11b6a43 Improve performance and efficiency by rewriting the creation process of "publication", "project", "dataset", "datasource", "software", "otherresearchproduct" and "result" tables, to be performed in a single query, for each one. 2024-07-03 13:03:15 +03:00
Lampros Smyrnaios fe2275a9b0 Merge branch 'beta' of https://code-repo.d4science.org/antonis.lempesis/dnet-hadoop into convert_hive_to_spark_actions
# Conflicts:
#	dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step14.sql
2024-06-25 20:17:47 +03:00
Lampros Smyrnaios a644a6f4fe Catch Spark-sql errors and show a log with the statement that failed. 2024-05-29 12:10:11 +03:00
Lampros Smyrnaios 888637773c Add missing "/*EOS*/" comments. 2024-05-27 12:34:49 +03:00
Lampros Smyrnaios e0ac494859 Merge branch 'beta' into convert_hive_to_spark_actions
# Conflicts:
#	dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15.sql
#	dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step15_5.sql
#	dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql
#	dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_5.sql
#	dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step2.sql
#	dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql
2024-05-27 12:27:40 +03:00
Lampros Smyrnaios 3c17183d10 Merge branch 'beta' of https://code-repo.d4science.org/antonis.lempesis/dnet-hadoop into convert_hive_to_spark_actions 2024-04-23 17:18:16 +03:00
Lampros Smyrnaios 69a9ac7393 Merge branch 'beta' of https://code-repo.d4science.org/antonis.lempesis/dnet-hadoop into convert_hive_to_spark_actions 2024-04-22 17:07:11 +03:00
Lampros Smyrnaios 342223f75c Merge branch 'beta' of https://code-repo.d4science.org/antonis.lempesis/dnet-hadoop into convert_hive_to_spark_actions 2024-04-19 13:18:34 +03:00
Lampros Smyrnaios 2616971e2b dhp-stats-update: remove leftover duplicate line 2024-04-18 16:18:16 +03:00
Lampros Smyrnaios ba533d9f34 Merge branch 'beta' of https://code-repo.d4science.org/antonis.lempesis/dnet-hadoop into convert_hive_to_spark_actions 2024-04-18 15:47:56 +03:00
Lampros Smyrnaios d46b78b659 dhp-stats-update:
- Set Steps 2-7 and 9 to limit the amount of files generated by Spark, from 8000, down to 100, to improve file-transfer and querying performance.
- Allow the workflow to run up to Step10. The Step11 seems to have some issues even when using hive-action.
2024-04-18 15:40:27 +03:00
Lampros Smyrnaios 6f2ebb2a52 Revert Step8 and Step11 to use Hive again, since their "UPDATE" statements are not supported by Spark. 2024-04-18 15:35:03 +03:00
Lampros Smyrnaios ca091c0f1e dhp-stats-update:
- Fix not passing some parameters to some Spark actions.
- Allow the workflow to run up to Step7. The first 7 steps seem to work out of the box.
2024-04-17 14:03:59 +03:00
Lampros Smyrnaios 0b897f2f66 Fix and add missing "DROP TABLE" statements, in "dhp-stats-update" sql-scripts. 2024-04-16 18:17:54 +03:00
Lampros Smyrnaios db33f7727c Update "dhp-stats-update" workflow to use "spark"-actions, instead of "hive" ones.
Note: Currently the code is set to only test the "Step1".
2024-04-15 16:22:40 +03:00
118 changed files with 2148 additions and 1946 deletions

View File

@@ -212,11 +212,11 @@ public class HttpConnector2 {
 					.format(
 						"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
 						MAPPER.writeValueAsString(report)));
-		} catch (MalformedURLException | UnknownHostException e) {
+		} catch (MalformedURLException e) {
 			log.error(e.getMessage(), e);
 			report.put(e.getClass().getName(), e.getMessage());
 			throw new CollectorException(e.getMessage(), e);
-		} catch (SocketTimeoutException | SocketException e) {
+		} catch (SocketTimeoutException | SocketException | UnknownHostException e) {
 			log.error(e.getMessage(), e);
 			report.put(e.getClass().getName(), e.getMessage());
 			backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);

View File

@@ -65,7 +65,13 @@ public class RunSQLSparkJob {
 			for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
 				log.info("executing: {}", statement);
 				long startTime = System.currentTimeMillis();
+				try {
 					spark.sql(statement).show();
+				} catch (Exception e) {
+					log.error("Error executing statement: {}", statement, e);
+					System.err.println("Error executing statement: " + statement + "\n" + e);
+					throw e;
+				}
 				log
 					.info(
 						"executed in {}",

View File

@@ -0,0 +1,70 @@
/*
 * Copyright (c) 2024.
 * SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
 * SPDX-License-Identifier: AGPL-3.0-or-later
 */

package eu.dnetlib.dhp.schema.oaf;

import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;

public class HashableStructuredProperty extends StructuredProperty {

	private static final long serialVersionUID = 8371670185221126045L;

	public static HashableStructuredProperty newInstance(String value, Qualifier qualifier, DataInfo dataInfo) {
		if (value == null) {
			return null;
		}
		final HashableStructuredProperty sp = new HashableStructuredProperty();
		sp.setValue(value);
		sp.setQualifier(qualifier);
		sp.setDataInfo(dataInfo);
		return sp;
	}

	public static HashableStructuredProperty newInstance(StructuredProperty sp) {
		HashableStructuredProperty hsp = new HashableStructuredProperty();
		hsp.setQualifier(sp.getQualifier());
		hsp.setValue(sp.getValue());
		hsp.setQualifier(sp.getQualifier());
		return hsp;
	}

	public static StructuredProperty toStructuredProperty(HashableStructuredProperty hsp) {
		StructuredProperty sp = new StructuredProperty();
		sp.setQualifier(hsp.getQualifier());
		sp.setValue(hsp.getValue());
		sp.setQualifier(hsp.getQualifier());
		return sp;
	}

	@Override
	public int hashCode() {
		return new HashCodeBuilder(11, 91)
			.append(getQualifier().getClassid())
			.append(getQualifier().getSchemeid())
			.append(getValue())
			.hashCode();
	}

	@Override
	public boolean equals(Object obj) {
		if (obj == null) {
			return false;
		}
		if (obj == this) {
			return true;
		}
		if (obj.getClass() != getClass()) {
			return false;
		}
		final HashableStructuredProperty rhs = (HashableStructuredProperty) obj;
		return new EqualsBuilder()
			.append(getQualifier().getClassid(), rhs.getQualifier().getClassid())
			.append(getQualifier().getSchemeid(), rhs.getQualifier().getSchemeid())
			.append(getValue(), rhs.getValue())
			.isEquals();
	}
}

View File

@@ -43,34 +43,4 @@ public class CleaningFunctions {
 		return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
 	}

-	/**
-	 * Utility method that normalises PID values on a per-type basis.
-	 * @param pid the PID whose value will be normalised.
-	 * @return the PID containing the normalised value.
-	 */
-	public static StructuredProperty normalizePidValue(StructuredProperty pid) {
-		pid
-			.setValue(
-				normalizePidValue(
-					pid.getQualifier().getClassid(),
-					pid.getValue()));
-		return pid;
-	}
-
-	public static String normalizePidValue(String pidType, String pidValue) {
-		String value = Optional
-			.ofNullable(pidValue)
-			.map(String::trim)
-			.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
-		switch (pidType) {
-
-			// TODO add cleaning for more PID types as needed
-			case "doi":
-				return value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX);
-		}
-		return value;
-	}
 }

View File

@@ -6,18 +6,11 @@ import org.apache.commons.lang3.StringUtils;
 public class DoiCleaningRule {

 	public static String clean(final String doi) {
-		return doi
-			.toLowerCase()
-			.replaceAll("\\s", "")
-			.replaceAll("^doi:", "")
-			.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
-	}
-
-	public static String normalizeDoi(final String input) {
-		if (input == null)
+		if (doi == null)
 			return null;
-		final String replaced = input
+		final String replaced = doi
 			.replaceAll("\\n|\\r|\\t|\\s", "")
-			.replaceAll("^doi:", "")
 			.toLowerCase()
 			.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
 		if (StringUtils.isEmpty(replaced))
@@ -32,7 +25,6 @@ public class DoiCleaningRule {
 			return null;
 		return ret;
 	}
 }

View File

@@ -563,12 +563,24 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 					Optional
 						.ofNullable(i.getPid())
 						.ifPresent(pid -> {
-							final Set<StructuredProperty> pids = Sets.newHashSet(pid);
+							final Set<HashableStructuredProperty> pids = pid
+								.stream()
+								.map(HashableStructuredProperty::newInstance)
+								.collect(Collectors.toCollection(HashSet::new));
 							Optional
 								.ofNullable(i.getAlternateIdentifier())
 								.ifPresent(altId -> {
-									final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
-									i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
+									final Set<HashableStructuredProperty> altIds = altId
+										.stream()
+										.map(HashableStructuredProperty::newInstance)
+										.collect(Collectors.toCollection(HashSet::new));
+									i
+										.setAlternateIdentifier(
+											Sets
+												.difference(altIds, pids)
+												.stream()
+												.map(HashableStructuredProperty::toStructuredProperty)
+												.collect(Collectors.toList()));
 								});
 						});

View File

@@ -175,7 +175,7 @@ public class IdentifierFactory implements Serializable {
 		return entity
 			.getPid()
 			.stream()
-			.map(CleaningFunctions::normalizePidValue)
+			.map(PidCleaner::normalizePidValue)
 			.filter(CleaningFunctions::pidFilter)
 			.collect(
 				Collectors
@@ -207,7 +207,7 @@ public class IdentifierFactory implements Serializable {
 					// filter away PIDs provided by a DS that is not considered an authority for the
 					// given PID Type
 					.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
-					.map(CleaningFunctions::normalizePidValue)
+					.map(PidCleaner::normalizePidValue)
 					.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
 					.filter(CleaningFunctions::pidFilter))
 			.orElse(Stream.empty());

View File

@@ -96,7 +96,7 @@ public class MergeEntitiesComparator implements Comparator<Oaf> {
 		// id
 		if (res == 0) {
 			if (left instanceof OafEntity && right instanceof OafEntity) {
-				res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
+				res = ((OafEntity) right).getId().compareTo(((OafEntity) left).getId());
 			}
 		}

View File

@@ -972,7 +972,7 @@ public class MergeUtils {
 	private static String extractKeyFromPid(final StructuredProperty pid) {
 		if (pid == null)
 			return null;
-		final StructuredProperty normalizedPid = CleaningFunctions.normalizePidValue(pid);
+		final StructuredProperty normalizedPid = PidCleaner.normalizePidValue(pid);

 		return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
 	}

View File

@@ -18,8 +18,8 @@ public class PidValueComparator implements Comparator<StructuredProperty> {
 		if (right == null)
 			return -1;

-		StructuredProperty l = CleaningFunctions.normalizePidValue(left);
-		StructuredProperty r = CleaningFunctions.normalizePidValue(right);
+		StructuredProperty l = PidCleaner.normalizePidValue(left);
+		StructuredProperty r = PidCleaner.normalizePidValue(right);

 		return Optional
 			.ofNullable(l.getValue())

View File

@@ -28,6 +28,7 @@ import com.jayway.jsonpath.JsonPath;
 import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
 import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
+import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
 import net.minidev.json.JSONArray;
 import scala.collection.JavaConverters;
 import scala.collection.Seq;
@@ -104,7 +105,7 @@ public class DHPUtils {
 	public static String generateUnresolvedIdentifier(final String pid, final String pidType) {

-		final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
+		final String cleanedPid = PidCleaner.normalizePidValue(pidType, pid);

 		return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
 	}

View File

@@ -29,7 +29,7 @@ class IdentifierFactoryTest {
 			"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);

 		verifyIdentifier(
-			"publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
+			"publication_doi3.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);

 		verifyIdentifier(
 			"publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true);
@@ -41,7 +41,7 @@ class IdentifierFactoryTest {
 			"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);

 		verifyIdentifier(
-			"publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
+			"publication_pmc2.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);

 		verifyIdentifier(
 			"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);

View File

@@ -179,7 +179,7 @@ class OafMapperUtilsTest {
 		assertEquals(
 			ModelConstants.DATASET_RESULTTYPE_CLASSID,
 			((Result) MergeUtils
-				.merge(p2, d1))
+				.merge(p2, d1, true))
 					.getResulttype()
 					.getClassid());
 	}

View File

@@ -29,7 +29,7 @@
     },
     {
       "qualifier": {"classid": "pmc"},
-      "value": "21459329"
+      "value": "PMC21459329"
     }
   ]
 }

View File

@@ -13,7 +13,7 @@
     },
     {
       "qualifier":{"classid":"pmc"},
-      "value":"21459329"
+      "value":"PMC21459329"
     }
   ]
 }

View File

@@ -90,7 +90,7 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
 		inferFrom = normalize(inferFrom);
 		inferFrom = filterAllStopWords(inferFrom);
 		Set<String> cities = getCities(inferFrom, 4);
-		return citiesToCountry(cities).stream().findFirst().orElse("UNKNOWN");
+		return citiesToCountry(cities).stream().filter(Objects::nonNull).findFirst().orElse("UNKNOWN");
 	}

 	public static String cityInference(String original) {

View File

@@ -48,7 +48,7 @@ public class TreeNodeDef implements Serializable {
 	// function for the evaluation of the node
 	public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
-		TreeNodeStats stats = new TreeNodeStats(ignoreUndefined);
+		TreeNodeStats stats = new TreeNodeStats();

 		// for each field in the node, it computes the
 		for (FieldConf fieldConf : fields) {

View File

@@ -9,11 +9,8 @@ public class TreeNodeStats implements Serializable {
 	private Map<String, FieldStats> results; // this is an accumulator for the results of the node

-	private final boolean ignoreUndefined;
-
-	public TreeNodeStats(boolean ignoreUndefined) {
+	public TreeNodeStats() {
 		this.results = new HashMap<>();
-		this.ignoreUndefined = ignoreUndefined;
 	}

 	public Map<String, FieldStats> getResults() {
@@ -25,10 +22,7 @@ public class TreeNodeStats implements Serializable {
 	}

 	public int fieldsCount() {
-		if (ignoreUndefined)
-			return this.results.size();
-		else
-			return this.results.size() - undefinedCount(); // do not count undefined
+		return this.results.size();
 	}

 	public int undefinedCount() {
@@ -84,23 +78,12 @@ public class TreeNodeStats implements Serializable {
 		double min = 100.0; // random high value
 		for (FieldStats fs : this.results.values()) {
 			if (fs.getResult() < min) {
-				if (fs.getResult() == -1) {
-					if (fs.isCountIfUndefined()) {
-						min = 0.0;
-					} else {
-						min = -1;
-					}
-				} else {
+				if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
 					min = fs.getResult();
-				}
 			}
 		}
-		if (ignoreUndefined) {
-			return min == -1.0 ? 0.0 : min;
-		} else {
-			return min;
-		}
+		return min;
 	}

 	// if at least one is true, return 1.0
 	public double or() {
@@ -108,12 +91,8 @@ public class TreeNodeStats implements Serializable {
 			if (fieldStats.getResult() >= fieldStats.getThreshold())
 				return 1.0;
 		}
-		if (!ignoreUndefined && undefinedCount() > 0) {
-			return -1.0;
-		} else {
-			return 0.0;
-		}
+		return 0.0;
 	}

 	// if at least one is false, return 0.0
 	public double and() {
@@ -121,7 +100,7 @@ public class TreeNodeStats implements Serializable {
 			if (fieldStats.getResult() == -1) {
 				if (fieldStats.isCountIfUndefined())
-					return ignoreUndefined ? 0.0 : -1.0;
+					return 0.0;
 			} else {
 				if (fieldStats.getResult() < fieldStats.getThreshold())
 					return 0.0;

View File

@@ -44,10 +44,12 @@ public class TreeProcessor {
 			TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
 			treeStats.addNodeStats(nextNodeName, stats);

-			double finalScore = stats.getFinalScore(currentNode.getAggregation());
-			if (finalScore == -1.0)
+			// if ignoreUndefined=false the miss is considered as undefined
+			if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
 				nextNodeName = currentNode.getUndefined();
-			else if (finalScore >= currentNode.getThreshold()) {
+			}
+			// if ignoreUndefined=true the miss is ignored and the score computed anyway
+			else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
 				nextNodeName = currentNode.getPositive();
 			} else {
 				nextNodeName = currentNode.getNegative();
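To make the branching rule in this hunk easier to follow, here is a small stand-alone sketch of the same decision, using plain parameters instead of the project's TreeNodeDef/TreeNodeStats types (all names below are illustrative): when undefined fields are not ignored and at least one field is undefined, the comparison follows the undefined branch; otherwise the aggregated score is compared against the node threshold.

public class TreeBranchSketch {

	// simplified stand-in for the branching performed by TreeProcessor above
	static String nextNode(boolean ignoreUndefined, int undefinedCount, double finalScore, double threshold,
		String undefined, String positive, String negative) {
		if (!ignoreUndefined && undefinedCount > 0) {
			// a missing comparison field routes the pair to the "undefined" branch
			return undefined;
		} else if (finalScore >= threshold) {
			return positive;
		}
		return negative;
	}

	public static void main(String[] args) {
		// one undefined field and ignoreUndefined=false: the score is not even considered
		System.out.println(nextNode(false, 1, 0.9, 0.5, "undefined", "match", "noMatch"));
		// ignoreUndefined=true: the miss is ignored and the score decides the branch
		System.out.println(nextNode(true, 1, 0.9, 0.5, "undefined", "match", "noMatch"));
	}
}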

View File

@@ -1,8 +1,7 @@
 package eu.dnetlib.pace.common;

-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;

 import org.junit.jupiter.api.*;
@@ -54,8 +53,17 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
 		System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
 	}

+	@Test()
+	public void countryInferenceTest_NPE() {
+		assertThrows(
+			NullPointerException.class,
+			() -> countryInference("UNKNOWN", null),
+			"Expected countryInference() to throw an NPE");
+	}
+
 	@Test
 	public void countryInferenceTest() {
+		assertEquals("UNKNOWN", countryInference("UNKNOWN", ""));
 		assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
 		assertEquals("UK", countryInference("UK", "Università di Bologna"));
 		assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));

View File

@@ -11,7 +11,6 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;

 import eu.dnetlib.pace.model.Person;
-import jdk.nashorn.internal.ir.annotations.Ignore;

 public class UtilTest {

View File

@@ -10,7 +10,6 @@ import java.util.List;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.BZip2Codec;
-import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
@@ -29,6 +28,7 @@ import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
+import eu.dnetlib.dhp.schema.oaf.utils.DoiCleaningRule;
 import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;
@@ -44,6 +44,10 @@ public class PrepareAffiliationRelations implements Serializable {
 	public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
 	public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
 	public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
+	public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
+	public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
+	public static final String DOI_URL_PREFIX = "https://doi.org/";
+	public static final int DOI_URL_PREFIX_LENGTH = 16;

 	public static <I extends Result> void main(String[] args) throws Exception {
@@ -74,6 +78,9 @@ public class PrepareAffiliationRelations implements Serializable {
 		final String webcrawlInputPath = parser.get("webCrawlInputPath");
 		log.info("webcrawlInputPath: {}", webcrawlInputPath);

+		final String publisherInputPath = parser.get("publisherInputPath");
+		log.info("publisherInputPath: {}", publisherInputPath);
+
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath: {}", outputPath);
@@ -84,41 +91,72 @@ public class PrepareAffiliationRelations implements Serializable {
 			isSparkSessionManaged,
 			spark -> {
 				Constants.removeOutputDir(spark, outputPath);
-
-				List<KeyValue> collectedFromCrossref = OafMapperUtils
-					.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
-				JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
-					spark, crossrefInputPath, collectedFromCrossref);
-
-				List<KeyValue> collectedFromPubmed = OafMapperUtils
-					.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
+				createActionSet(
+					spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath,
+					publisherInputPath, outputPath);
+			});
+	}
+
+	private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath,
+		String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath,
+		String outputPath) {
+		List<KeyValue> collectedfromOpenAIRE = OafMapperUtils
+			.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
+
+		JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
+			spark, crossrefInputPath, collectedfromOpenAIRE);
+
 		JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
-					spark, pubmedInputPath, collectedFromPubmed);
-
-				List<KeyValue> collectedFromOpenAPC = OafMapperUtils
-					.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
-				JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
-					spark, openapcInputPath, collectedFromOpenAPC);
-
-				List<KeyValue> collectedFromDatacite = OafMapperUtils
-					.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
+			spark, pubmedInputPath, collectedfromOpenAIRE);
+
+		JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
+			spark, openapcInputPath, collectedfromOpenAIRE);
+
 		JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
-					spark, dataciteInputPath, collectedFromDatacite);
-
-				List<KeyValue> collectedFromWebCrawl = OafMapperUtils
-					.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
+			spark, dataciteInputPath, collectedfromOpenAIRE);
+
 		JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
-					spark, webcrawlInputPath, collectedFromWebCrawl);
+			spark, webcrawlInputPath, collectedfromOpenAIRE);
+
+		JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
+			spark, publisherlInputPath, collectedfromOpenAIRE);

 		crossrefRelations
 			.union(pubmedRelations)
 			.union(openAPCRelations)
 			.union(dataciteRelations)
 			.union(webCrawlRelations)
+			.union(publisherRelations)
 			.saveAsHadoopFile(
 				outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
-			});
+	}
+
+	private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
+		String inputPath,
+		List<KeyValue> collectedfrom) {
+
+		Dataset<Row> df = spark
+			.read()
+			.schema(
+				"`DOI` STRING, `Organizations` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
+			.json(inputPath)
+			.where("DOI is not null");
+
+		return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
+	}
+
+	private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
+		List<KeyValue> collectedfrom) {
+
+		Dataset<Row> df = spark
+			.read()
+			.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
+			.json(inputPath)
+			.where("DOI is not null");
+
+		return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
 	}

 	private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
@@ -132,6 +170,24 @@ public class PrepareAffiliationRelations implements Serializable {
 			.json(inputPath)
 			.where("DOI is not null");

+		return getTextTextJavaPairRDD(collectedfrom, df);
+	}
+
+	private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
+		String inputPath,
+		List<KeyValue> collectedfrom) {
+		// load and parse affiliation relations from HDFS
+		Dataset<Row> df = spark
+			.read()
+			.schema(
+				"`DOI` STRING, `Matchings` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
+			.json(inputPath)
+			.where("DOI is not null");
+
+		return getTextTextJavaPairRDDNew(collectedfrom, df);
+	}
+
+	private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
 		// unroll nested arrays
 		df = df
 			.withColumn("matching", functions.explode(new Column("Matchings")))
@@ -147,7 +203,7 @@ public class PrepareAffiliationRelations implements Serializable {
 				// DOI to OpenAIRE id
 				final String paperId = ID_PREFIX
-					+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
+					+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));

 				// ROR id to OpenAIRE id
 				final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
@@ -179,6 +235,69 @@ public class PrepareAffiliationRelations implements Serializable {
 					new Text(OBJECT_MAPPER.writeValueAsString(aa))));
 	}

+	private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df) {
+		// unroll nested arrays
+		df = df
+			.withColumn("matching", functions.explode(new Column("Matchings")))
+			.select(
+				new Column("DOI").as("doi"),
+				new Column("matching.PID").as("pidtype"),
+				new Column("matching.Value").as("pidvalue"),
+				new Column("matching.Confidence").as("confidence"),
+				new Column("matching.Status").as("status"))
+			.where("status = 'active'");
+
+		// prepare action sets for affiliation relations
+		return df
+			.toJavaRDD()
+			.flatMap((FlatMapFunction<Row, Relation>) row -> {
+
+				// DOI to OpenAIRE id
+				final String paperId = ID_PREFIX
+					+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));
+
+				// Organization to OpenAIRE identifier
+				String affId = null;
+				if (row.getAs("pidtype").equals("ROR"))
+					// ROR id to OpenIARE id
+					affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("pidvalue"));
+				else
+					// getting the OpenOrgs identifier for the organization
+					affId = row.getAs("pidvalue");
+
+				Qualifier qualifier = OafMapperUtils
+					.qualifier(
+						BIP_AFFILIATIONS_CLASSID,
+						BIP_AFFILIATIONS_CLASSNAME,
+						ModelConstants.DNET_PROVENANCE_ACTIONS,
+						ModelConstants.DNET_PROVENANCE_ACTIONS);
+
+				// format data info; setting `confidence` into relation's `trust`
+				DataInfo dataInfo = OafMapperUtils
+					.dataInfo(
+						false,
+						BIP_INFERENCE_PROVENANCE,
+						true,
+						false,
+						qualifier,
+						Double.toString(row.getAs("confidence")));
+
+				// return bi-directional relations
+				return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
+
+			})
+			.map(p -> new AtomicAction(Relation.class, p))
+			.mapToPair(
+				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
+					new Text(OBJECT_MAPPER.writeValueAsString(aa))));
+	}
+
+	private static String removePrefix(String doi) {
+		if (doi.startsWith(DOI_URL_PREFIX))
+			return doi.substring(DOI_URL_PREFIX_LENGTH);
+		return doi;
+	}
+
 	private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
 		DataInfo dataInfo) {
 		return Arrays
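For readability, a small sketch of the DOI-to-identifier convention used in the hunks above: a publisher-provided DOI URL is first stripped of the https://doi.org/ prefix, then normalised (whitespace removed, lower-cased), and the result identifier is ID_PREFIX plus the MD5 of the normalised DOI. The helper below is a standalone approximation, not the project's classes; the prefix stripping and the "50|doi_________::" prefix come from the code above, while clean() and md5() merely stand in for DoiCleaningRule.clean and IdentifierFactory.md5:

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

public class DoiIdSketch {

	private static final String DOI_URL_PREFIX = "https://doi.org/";
	private static final String ID_PREFIX = "50|doi_________::";

	// strip the resolver prefix, as removePrefix does above
	static String removePrefix(String doi) {
		return doi.startsWith(DOI_URL_PREFIX) ? doi.substring(DOI_URL_PREFIX.length()) : doi;
	}

	// rough equivalent of the DOI cleaning step: drop whitespace and lower-case the value
	static String clean(String doi) {
		return doi == null ? null : doi.replaceAll("\\s", "").toLowerCase();
	}

	// hex-encoded MD5, standing in for IdentifierFactory.md5
	static String md5(String s) throws Exception {
		MessageDigest md = MessageDigest.getInstance("MD5");
		return String.format("%032x", new BigInteger(1, md.digest(s.getBytes(StandardCharsets.UTF_8))));
	}

	public static void main(String[] args) throws Exception {
		String raw = "https://doi.org/10.1089/10872910260066679";
		System.out.println(ID_PREFIX + md5(clean(removePrefix(raw))));
	}
}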

View File

@@ -46,6 +46,9 @@ public class GetOpenCitationsRefs implements Serializable {
 		final String outputPath = parser.get("outputPath");
 		log.info("outputPath {}", outputPath);

+		final String backupPath = parser.get("backupPath");
+		log.info("backupPath {}", backupPath);
+
 		Configuration conf = new Configuration();
 		conf.set("fs.defaultFS", hdfsNameNode);
@@ -53,11 +56,11 @@ public class GetOpenCitationsRefs implements Serializable {
 		GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();

-		ocr.doExtract(inputPath, outputPath, fileSystem);
+		ocr.doExtract(inputPath, outputPath, backupPath, fileSystem);
 	}

-	private void doExtract(String inputPath, String outputPath, FileSystem fileSystem)
+	private void doExtract(String inputPath, String outputPath, String backupPath, FileSystem fileSystem)
 		throws IOException {

 		RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
@@ -89,6 +92,7 @@ public class GetOpenCitationsRefs implements Serializable {
 				}
 			}
+			fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
 		}
 	}

View File

@@ -107,7 +107,8 @@ public class ReadCOCI implements Serializable {
 				.mode(SaveMode.Append)
 				.option("compression", "gzip")
 				.json(outputPath);
-			fileSystem.rename(fileStatus.getPath(), new Path("/tmp/miriam/OC/DONE"));
+
+			fileSystem.delete(fileStatus.getPath());
 		}
 	}

View File

@@ -11,6 +11,7 @@ import java.util.stream.Collectors;
 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
@@ -20,7 +21,6 @@ import org.apache.spark.sql.*;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.spark_project.jetty.util.StringUtil;

 import com.fasterxml.jackson.databind.ObjectMapper;
@@ -297,7 +297,7 @@ public class ExtractPerson implements Serializable {
 	}

 	private static Relation getAffiliationRelation(Employment row) {
-		String source = PERSON_PREFIX + IdentifierFactory.md5(row.getOrcid());
+		String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(row.getOrcid());
 		String target = ROR_PREFIX
 			+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue()));
 		List<KeyValue> properties = new ArrayList<>();
@@ -317,13 +317,13 @@ public class ExtractPerson implements Serializable {
 				"0.91"),
 			null);

-		if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
+		if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtils.isNotBlank(row.getStartDate())) {
 			KeyValue kv = new KeyValue();
 			kv.setKey("startDate");
 			kv.setValue(row.getStartDate());
 			properties.add(kv);
 		}
-		if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) {
+		if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtils.isNotBlank(row.getEndDate())) {
 			KeyValue kv = new KeyValue();
 			kv.setKey("endDate");
 			kv.setValue(row.getEndDate());

View File

@@ -69,9 +69,12 @@ public class CollectorWorker extends ReportingJob {
 		scheduleReport(counter);
 		try (SequenceFile.Writer writer = SequenceFile
-			.createWriter(this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
-				.keyClass(IntWritable.class), SequenceFile.Writer
-				.valueClass(Text.class), SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
+			.createWriter(
+				this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
+					.keyClass(IntWritable.class),
+				SequenceFile.Writer
+					.valueClass(Text.class),
+				SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
 			final IntWritable key = new IntWritable(counter.get());
 			final Text value = new Text();
 			plugin

View File

@@ -36,7 +36,9 @@ public class OsfPreprintsCollectorPlugin implements CollectorPlugin {
 			.map(s -> NumberUtils.toInt(s, PAGE_SIZE_VALUE_DEFAULT))
 			.orElse(PAGE_SIZE_VALUE_DEFAULT);

-		if (StringUtils.isBlank(baseUrl)) { throw new CollectorException("Param 'baseUrl' is null or empty"); }
+		if (StringUtils.isBlank(baseUrl)) {
+			throw new CollectorException("Param 'baseUrl' is null or empty");
+		}

 		final OsfPreprintsIterator it = new OsfPreprintsIterator(baseUrl, pageSize, getClientParams());

View File

@@ -54,7 +54,8 @@ public class OsfPreprintsIterator implements Iterator<String> {
 	@Override
 	public boolean hasNext() {
 		synchronized (this.recordQueue) {
-			while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl) && this.currentUrl.startsWith("http")) {
+			while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl)
+				&& this.currentUrl.startsWith("http")) {
 				try {
 					this.currentUrl = downloadPage(this.currentUrl);
 				} catch (final CollectorException e) {
@@ -63,7 +64,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
 				}
 			}

-			if (!this.recordQueue.isEmpty()) { return true; }
+			if (!this.recordQueue.isEmpty()) {
+				return true;
+			}

 			return false;
 		}
@@ -112,7 +115,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
 	}

 	private Document downloadUrl(final String url, final int attempt) throws CollectorException {
-		if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); }
+		if (attempt > MAX_ATTEMPTS) {
+			throw new CollectorException("Max Number of attempts reached, url:" + url);
+		}

 		if (attempt > 0) {
 			final int delay = (attempt * 5000);

View File

@@ -28,13 +28,19 @@
     "paramLongName": "dataciteInputPath",
     "paramDescription": "the path to get the input data from Datacite",
     "paramRequired": true
-  },{
+  },
+  {
     "paramName": "wip",
     "paramLongName": "webCrawlInputPath",
     "paramDescription": "the path to get the input data from Web Crawl",
     "paramRequired": true
-  }
-  ,
+  },
+  {
+    "paramName": "pub",
+    "paramLongName": "publisherInputPath",
+    "paramDescription": "the path to get the input data from publishers",
+    "paramRequired": true
+  },
   {
     "paramName": "o",
     "paramLongName": "outputPath",

View File

@@ -16,5 +16,11 @@
     "paramLongName": "hdfsNameNode",
     "paramDescription": "the hdfs name node",
     "paramRequired": true
+  },
+  {
+    "paramName": "bp",
+    "paramLongName": "backupPath",
+    "paramDescription": "the hdfs path to move the OC data after the extraction",
+    "paramRequired": true
   }
 ]

View File

@@ -24,12 +24,13 @@
     "paramLongName": "outputPath",
     "paramDescription": "the hdfs name node",
     "paramRequired": true
-  }, {
+  },
+  {
     "paramName": "nn",
     "paramLongName": "hdfsNameNode",
     "paramDescription": "the hdfs name node",
     "paramRequired": true
   }
 ]

View File

@@ -94,17 +94,7 @@
             <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
             <arg>--inputPath</arg><arg>${inputPath}/Original</arg>
             <arg>--outputPath</arg><arg>${inputPath}/Extracted</arg>
-        </java>
-        <ok to="read"/>
-        <error to="Kill"/>
-    </action>
-    <action name="extract_correspondence">
-        <java>
-            <main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
-            <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
-            <arg>--inputPath</arg><arg>${inputPath}/correspondence</arg>
-            <arg>--outputPath</arg><arg>${inputPath}/correspondence_extracted</arg>
+            <arg>--backupPath</arg><arg>${inputPath}/backup</arg>
         </java>
         <ok to="read"/>
         <error to="Kill"/>

View File

@@ -14,7 +14,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{
   PidType
 }
 import eu.dnetlib.dhp.utils.DHPUtils
-import org.apache.commons.lang.StringUtils
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.Row
 import org.json4s
 import org.json4s.DefaultFormats
@@ -332,7 +332,7 @@ case object Crossref2Oaf {
     implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

     //MAPPING Crossref DOI into PID
-    val doi: String = DoiCleaningRule.normalizeDoi((json \ "DOI").extract[String])
+    val doi: String = DoiCleaningRule.clean((json \ "DOI").extract[String])
     result.setPid(
       List(
         structuredProperty(
@@ -504,6 +504,24 @@ case object Crossref2Oaf {
       )
     }

+    val is_review = json \ "relation" \ "is-review-of" \ "id"
+    if (is_review != JNothing) {
+      instance.setInstancetype(
+        OafMapperUtils.qualifier(
+          "0015",
+          "peerReviewed",
+          ModelConstants.DNET_REVIEW_LEVELS,
+          ModelConstants.DNET_REVIEW_LEVELS
+        )
+      )
+    }
+
+    if (doi.startsWith("10.3410") || doi.startsWith("10.12703"))
+      instance.setHostedby(
+        OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true), "H1Connect")
+      )
+
     instance.setAccessright(
       decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
     )
@@ -655,7 +673,7 @@ case object Crossref2Oaf {
     val doi = input.getString(0)
     val rorId = input.getString(1)

-    val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.normalizeDoi(doi)}"
+    val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.clean(doi)}"
     val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)

     val r: Relation = new Relation

View File

@@ -407,10 +407,9 @@ object DataciteToOAFTransformation {
         )
       }
       if (c.affiliation.isDefined)
-        a.setAffiliation(
+        a.setRawAffiliationString(
           c.affiliation.get
             .filter(af => af.nonEmpty)
-            .map(af => OafMapperUtils.field(af, dataInfo))
             .asJava
         )
       a.setRank(idx + 1)

View File

@ -28,8 +28,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
public class PrepareAffiliationRelationsTest { public class PrepareAffiliationRelationsTest {
@ -39,8 +39,7 @@ public class PrepareAffiliationRelationsTest {
private static Path workingDir; private static Path workingDir;
private static final String ID_PREFIX = "50|doi_________::"; private static final String ID_PREFIX = "50|doi_________::";
private static final Logger log = LoggerFactory private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelationsTest.class);
.getLogger(PrepareAffiliationRelationsTest.class);
@BeforeAll @BeforeAll
public static void beforeAll() throws IOException { public static void beforeAll() throws IOException {
@ -74,21 +73,34 @@ public class PrepareAffiliationRelationsTest {
@Test @Test
void testMatch() throws Exception { void testMatch() throws Exception {
String crossrefAffiliationRelationPath = getClass() String crossrefAffiliationRelationPathNew = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json") .getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
.getPath(); .getPath();
String crossrefAffiliationRelationPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror_old.json")
.getPath();
String publisherAffiliationRelationPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
.getPath();
String publisherAffiliationRelationOldPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publichers_old")
.getPath();
String outputPath = workingDir.toString() + "/actionSet"; String outputPath = workingDir.toString() + "/actionSet";
PrepareAffiliationRelations PrepareAffiliationRelations
.main( .main(
new String[] { new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-crossrefInputPath", crossrefAffiliationRelationPath, "-crossrefInputPath", crossrefAffiliationRelationPathNew,
"-pubmedInputPath", crossrefAffiliationRelationPath, "-pubmedInputPath", crossrefAffiliationRelationPath,
"-openapcInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPathNew,
"-dataciteInputPath", crossrefAffiliationRelationPath, "-dataciteInputPath", crossrefAffiliationRelationPath,
"-webCrawlInputPath", crossrefAffiliationRelationPath, "-webCrawlInputPath", crossrefAffiliationRelationPath,
"-publisherInputPath", publisherAffiliationRelationOldPath,
"-outputPath", outputPath "-outputPath", outputPath
}); });
@ -99,13 +111,8 @@ public class PrepareAffiliationRelationsTest {
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
// for (Relation r : tmp.collect()) {
// System.out.println(
// r.getSource() + "\t" + r.getTarget() + "\t" + r.getRelType() + "\t" + r.getRelClass() + "\t" + r.getSubRelType() + "\t" + r.getValidationDate() + "\t" + r.getDataInfo().getTrust() + "\t" + r.getDataInfo().getInferred()
// );
// }
// count the number of relations // count the number of relations
assertEquals(120, tmp.count()); assertEquals(150, tmp.count()); // 18 + 24*3 + 30*2 = 150
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result"); dataset.createOrReplaceTempView("result");
@ -116,7 +123,7 @@ public class PrepareAffiliationRelationsTest {
// verify that we have equal number of bi-directional relations // verify that we have equal number of bi-directional relations
Assertions Assertions
.assertEquals( .assertEquals(
60, execVerification 75, execVerification
.filter( .filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList() .collectAsList()
@ -124,26 +131,56 @@ public class PrepareAffiliationRelationsTest {
Assertions Assertions
.assertEquals( .assertEquals(
60, execVerification 75, execVerification
.filter( .filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList() .collectAsList()
.size()); .size());
// check confidence value of a specific relation // check confidence value of a specific relation
String sourceDOI = "10.1061/(asce)0733-9399(2002)128:7(759)"; String sourceDOI = "10.1089/10872910260066679";
final String sourceOpenaireId = ID_PREFIX final String sourceOpenaireId = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI)); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", sourceDOI));
Assertions Assertions
.assertEquals( .assertEquals(
"0.7071067812", execVerification "1.0", execVerification
.filter( .filter(
"source='" + sourceOpenaireId + "'") "source='" + sourceOpenaireId + "'")
.collectAsList() .collectAsList()
.get(0) .get(0)
.getString(4)); .getString(4));
final String publisherid = ID_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1089/10872910260066679"));
final String rorId = "20|ror_________::" + IdentifierFactory.md5("https://ror.org/05cf8a891");
Assertions
.assertEquals(
2, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
Assertions
.assertEquals(
1, execVerification
.filter(
"source = '" + ID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue("doi", "10.1007/s00217-010-1268-9"))
+ "' and target = '" + "20|ror_________::"
+ IdentifierFactory.md5("https://ror.org/03265fv13") + "'")
.count());
Assertions
.assertEquals(
3, execVerification
.filter(
"source = '" + ID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue("doi", "10.1007/3-540-47984-8_14"))
+ "' and target = '" + "20|ror_________::"
+ IdentifierFactory.md5("https://ror.org/00a0n9e72") + "'")
.count());
} }
} }
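The updated assertions build the expected relation endpoints from the DOI and the matched ROR id: the prefix "50|doi_________::" for results and "20|ror_________::" for organizations, followed by an MD5 hash of the identifier. A minimal stand-alone sketch of that construction, assuming IdentifierFactory.md5 is a plain hex-encoded MD5 digest and skipping the normalization performed by PidCleaner.normalizePidValue:

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

public class ExpectedIdSketch {

    // hypothetical stand-in for IdentifierFactory.md5: hex-encoded MD5 of the input string
    static String md5(String s) throws Exception {
        byte[] digest = MessageDigest.getInstance("MD5").digest(s.getBytes(StandardCharsets.UTF_8));
        StringBuilder sb = new StringBuilder();
        for (byte b : digest) {
            sb.append(String.format("%02x", b));
        }
        return sb.toString();
    }

    public static void main(String[] args) throws Exception {
        // result id from the DOI asserted in the test, organization id from the matched ROR
        String source = "50|doi_________::" + md5("10.1089/10872910260066679");
        String target = "20|ror_________::" + md5("https://ror.org/05cf8a891");
        System.out.println(source + " -> " + target);
    }
}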

View File

@ -31,6 +31,7 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
public class CreateOpenCitationsASTest { public class CreateOpenCitationsASTest {
@ -280,17 +281,17 @@ public class CreateOpenCitationsASTest {
@Test @Test
void testRelationsSourceTargetCouple() throws Exception { void testRelationsSourceTargetCouple() throws Exception {
final String doi1 = "50|doi_________::" final String doi1 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
final String doi2 = "50|doi_________::" final String doi2 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
final String doi3 = "50|doi_________::" final String doi3 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
final String doi4 = "50|doi_________::" final String doi4 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
final String doi5 = "50|doi_________::" final String doi5 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
final String doi6 = "50|doi_________::" final String doi6 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
String inputPath = getClass() String inputPath = getClass()
.getResource( .getResource(

View File

@ -28,6 +28,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
@ -270,17 +271,17 @@ public class CreateTAActionSetTest {
@Test @Test
void testRelationsSourceTargetCouple() throws Exception { void testRelationsSourceTargetCouple() throws Exception {
final String doi1 = "50|doi_________::" final String doi1 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
final String doi2 = "50|doi_________::" final String doi2 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
final String doi3 = "50|doi_________::" final String doi3 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
final String doi4 = "50|doi_________::" final String doi4 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
final String doi5 = "50|doi_________::" final String doi5 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
final String doi6 = "50|doi_________::" final String doi6 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5")); + IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
String inputPath = getClass() String inputPath = getClass()
.getResource( .getResource(

View File

@ -50,7 +50,8 @@ public class OsfPreprintsCollectorPluginTest {
@Test @Test
@Disabled @Disabled
void test_one() throws CollectorException { void test_one() throws CollectorException {
this.plugin.collect(this.api, new AggregatorReport()) this.plugin
.collect(this.api, new AggregatorReport())
.limit(1) .limit(1)
.forEach(log::info); .forEach(log::info);
} }
@ -95,7 +96,8 @@ public class OsfPreprintsCollectorPluginTest {
final HttpConnector2 connector = new HttpConnector2(); final HttpConnector2 connector = new HttpConnector2();
try { try {
final String res = connector.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json"); final String res = connector
.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json");
System.out.println(res); System.out.println(res);
fail(); fail();
} catch (final Throwable e) { } catch (final Throwable e) {

View File

@ -1,9 +1,10 @@
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]} {"DOI":"10.1021\/ac020069k","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/01f5ytq51","Status":"active","Confidence":1}]}
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]} {"DOI":"10.1161\/01.cir.0000013846.72805.7e","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02pttbw34","Status":"active","Confidence":1}]}
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]} {"DOI":"10.1161\/hy02t2.102992","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/00qqv6244","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/00p991c53","Status":"active","Confidence":1}]}
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]} {"DOI":"10.1126\/science.1073633","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03xez1567","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/006w34k90","Status":"active","Confidence":1}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]} {"DOI":"10.1089\/10872910260066679","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/05cf8a891","Status":"active","Confidence":1}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]} {"DOI":"10.1108\/02656719610116117","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03mnm0t94","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/007tn5k56","Status":"active","Confidence":1}]}
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]} {"DOI":"10.1080\/01443610050111986","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/001x4vz59","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/01tmqtf75","Status":"active","Confidence":1}]}
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]} {"DOI":"10.1021\/cm020118+","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02cf1je33","Confidence":1,"Status":"inactive"},{"PID":"ROR","Value":"https:\/\/ror.org\/01hvx5h04","Confidence":1,"Status":"active"}]}
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]} {"DOI":"10.1161\/hc1202.104524","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/040r8fr65","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/04fctr677","Status":"active","Confidence":1}]}
{"DOI":"10.1021\/ma011134f","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/04tj63d06","Status":"active","Confidence":1}]}

View File

@ -0,0 +1,9 @@
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
{"DOI": "https://doi.org/10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}

View File

@ -0,0 +1,6 @@
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}, {"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}, {"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}

View File

@ -0,0 +1,6 @@
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"Value": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/03265fv13", "Confidence": 1}]}
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/03bea9k73", "Confidence": 1}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04xfq0f34", "Confidence": 1}]}
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"Value": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"Value": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04agmb972", "Confidence": 1}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/05m7pjf47", "Confidence": 1}]}
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"Value": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"PID":"ROR","Status":"active","Value": "https://ror.org/03265fv13", "Confidence": 1}]}

View File

@ -70,9 +70,8 @@ public class PrepareRelatedProjectsJob {
final Dataset<Relation> rels = ClusterUtils final Dataset<Relation> rels = ClusterUtils
.loadRelations(graphPath, spark) .loadRelations(graphPath, spark)
.filter((FilterFunction<Relation>) r -> r.getDataInfo().getDeletedbyinference()) .filter((FilterFunction<Relation>) r -> ModelConstants.RESULT_PROJECT.equals(r.getRelType()))
.filter((FilterFunction<Relation>) r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)) .filter((FilterFunction<Relation>) r -> !BrokerConstants.IS_MERGED_IN_CLASS.equals(r.getRelClass()))
.filter((FilterFunction<Relation>) r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource())) .filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget())); .filter((FilterFunction<Relation>) r -> !ClusterUtils.isDedupRoot(r.getTarget()));

View File

@ -53,7 +53,7 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
.collect(Collectors.toSet()); .collect(Collectors.toSet());
return source return source
.getPids() .getSubjects()
.stream() .stream()
.filter(s -> !existingSubjects.contains(subjectAsString(s))) .filter(s -> !existingSubjects.contains(subjectAsString(s)))
.collect(Collectors.toList()); .collect(Collectors.toList());
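The one-line fix above makes the matcher stream the source subjects rather than its PIDs when computing what the target is missing. A minimal sketch of the corrected comparison, with plain strings standing in for the broker OaBrokerTypedValue objects; it mirrors the behaviour exercised by the new EnrichMoreSubjectTest below:

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class MoreSubjectSketch {

    // returns the source subjects that the target does not have yet
    static List<String> findDifferences(List<String> sourceSubjects, List<String> targetSubjects) {
        Set<String> existing = new HashSet<>(targetSubjects);
        return sourceSubjects
            .stream()
            .filter(s -> !existing.contains(s))
            .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        System.out.println(findDifferences(List.of("arxiv::subject_01"), List.of()));                    // [arxiv::subject_01]
        System.out.println(findDifferences(List.of("arxiv::subject_01"), List.of("arxiv::subject_01"))); // []
    }
}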

View File

@ -0,0 +1,60 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Arrays;
import java.util.List;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
public class EnrichMoreSubjectTest {
final EnrichMoreSubject matcher = new EnrichMoreSubject();
@BeforeEach
void setUp() throws Exception {
}
@Test
void testFindDifferences_1() {
final OaBrokerMainEntity source = new OaBrokerMainEntity();
final OaBrokerMainEntity target = new OaBrokerMainEntity();
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
assertTrue(list.isEmpty());
}
@Test
void testFindDifferences_2() {
final OaBrokerMainEntity source = new OaBrokerMainEntity();
final OaBrokerMainEntity target = new OaBrokerMainEntity();
source.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
assertEquals(1, list.size());
}
@Test
void testFindDifferences_3() {
final OaBrokerMainEntity source = new OaBrokerMainEntity();
final OaBrokerMainEntity target = new OaBrokerMainEntity();
target.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
assertTrue(list.isEmpty());
}
@Test
void testFindDifferences_4() {
final OaBrokerMainEntity source = new OaBrokerMainEntity();
final OaBrokerMainEntity target = new OaBrokerMainEntity();
source.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
target.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01")));
final List<OaBrokerTypedValue> list = this.matcher.findDifferences(source, target);
assertTrue(list.isEmpty());
}
}

View File

@ -2,14 +2,13 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.beanutils.BeanUtils; import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.FlatMapGroupsFunction; import org.apache.spark.api.java.function.FlatMapGroupsFunction;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import eu.dnetlib.dhp.oa.dedup.model.Identifier; import eu.dnetlib.dhp.oa.dedup.model.Identifier;
@ -107,6 +106,8 @@ public class DedupRecordFactory {
final HashSet<String> acceptanceDate = new HashSet<>(); final HashSet<String> acceptanceDate = new HashSet<>();
boolean isVisible = false;
while (it.hasNext()) { while (it.hasNext()) {
Tuple3<String, String, OafEntity> t = it.next(); Tuple3<String, String, OafEntity> t = it.next();
OafEntity entity = t._3(); OafEntity entity = t._3();
@ -114,6 +115,7 @@ public class DedupRecordFactory {
if (entity == null) { if (entity == null) {
aliases.add(t._2()); aliases.add(t._2());
} else { } else {
isVisible = isVisible || !entity.getDataInfo().getInvisible();
cliques.add(entity); cliques.add(entity);
if (acceptanceDate.size() < MAX_ACCEPTANCE_DATE) { if (acceptanceDate.size() < MAX_ACCEPTANCE_DATE) {
@ -129,13 +131,20 @@ public class DedupRecordFactory {
} }
if (acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) { if (!isVisible || acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) {
return Collections.emptyIterator(); return Collections.emptyIterator();
} }
OafEntity mergedEntity = MergeUtils.mergeGroup(dedupId, cliques.iterator()); OafEntity mergedEntity = MergeUtils.mergeGroup(dedupId, cliques.iterator());
// dedup records do not have date of transformation attribute // dedup records do not have date of transformation attribute
mergedEntity.setDateoftransformation(null); mergedEntity.setDateoftransformation(null);
mergedEntity
.setMergedIds(
Stream
.concat(cliques.stream().map(OafEntity::getId), aliases.stream())
.distinct()
.sorted()
.collect(Collectors.toList()));
return Stream return Stream
.concat( .concat(
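Two things change in this factory: a group containing only invisible records no longer produces a dedup record, and the merged entity now records the identifiers it absorbed in the new mergedIds field, built from the grouped entity ids plus the unresolved aliases, de-duplicated and sorted. A minimal stand-alone sketch of the mergedIds computation, with plain strings in place of OafEntity:

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class MergedIdsSketch {

    // ids of the grouped entities plus the aliases that pointed to them, distinct and sorted
    static List<String> mergedIds(List<String> cliqueIds, List<String> aliases) {
        return Stream
            .concat(cliqueIds.stream(), aliases.stream())
            .distinct()
            .sorted()
            .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        System.out.println(mergedIds(List.of("50|b", "50|a"), List.of("50|c", "50|a")));
        // [50|a, 50|b, 50|c]
    }
}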

View File

@ -5,11 +5,11 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTION
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVENANCE_DEDUP; import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVENANCE_DEDUP;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.*;
import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -17,6 +17,7 @@ import org.xml.sax.SAXException;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.OafEntity;
@ -25,6 +26,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import scala.collection.JavaConverters;
public class SparkCreateDedupRecord extends AbstractSparkAction { public class SparkCreateDedupRecord extends AbstractSparkAction {
@ -85,6 +87,36 @@ public class SparkCreateDedupRecord extends AbstractSparkAction {
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath); .json(outputPath);
log.info("Updating mergerels for: '{}'", subEntity);
final Dataset<Row> dedupIds = spark
.read()
.schema("`id` STRING, `mergedIds` ARRAY<STRING>")
.json(outputPath)
.selectExpr("id as source", "explode(mergedIds) as target");
spark
.read()
.load(mergeRelPath)
.where("relClass == 'merges'")
.join(dedupIds, JavaConverters.asScalaBuffer(Arrays.asList("source", "target")).toSeq(), "left_semi")
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.save(workingPath + "/mergerel_filtered");
final Dataset<Row> validRels = spark.read().load(workingPath + "/mergerel_filtered");
final Dataset<Row> filteredMergeRels = validRels
.union(
validRels
.withColumnRenamed("source", "source_tmp")
.withColumnRenamed("target", "target_tmp")
.withColumn("relClass", functions.lit(ModelConstants.IS_MERGED_IN))
.withColumnRenamed("target_tmp", "source")
.withColumnRenamed("source_tmp", "target"));
saveParquet(filteredMergeRels, mergeRelPath, SaveMode.Overwrite);
removeOutputDir(spark, workingPath + "/mergerel_filtered");
} }
} }
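The added step keeps a 'merges' relation only when its (source, target) pair also appears among the (id, mergedId) pairs written with the dedup records, which is what the left_semi join computes, and then rebuilds the inverse isMergedIn direction by swapping the two columns. A minimal in-memory sketch of that semantics, using simple records instead of Spark rows (not the production code):

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class MergeRelFilterSketch {

    record Rel(String source, String target, String relClass) {}

    static List<Rel> filterAndMirror(List<Rel> mergeRels, Map<String, List<String>> mergedIdsByDedupId) {
        // keep only the 'merges' relations actually used when building the dedup records
        List<Rel> kept = mergeRels
            .stream()
            .filter(r -> "merges".equals(r.relClass()))
            .filter(r -> mergedIdsByDedupId.getOrDefault(r.source(), List.of()).contains(r.target()))
            .collect(Collectors.toList());
        // re-create the inverse direction (ModelConstants.IS_MERGED_IN in the real code)
        List<Rel> inverse = kept
            .stream()
            .map(r -> new Rel(r.target(), r.source(), "isMergedIn"))
            .collect(Collectors.toList());
        return Stream.concat(kept.stream(), inverse.stream()).collect(Collectors.toList());
    }
}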

View File

@ -7,7 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactor
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.apache.commons.lang.StringUtils import org.apache.commons.lang3.StringUtils
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST._ import org.json4s.JsonAST._
@ -560,7 +560,11 @@ case object Crossref2Oaf {
"10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" | "10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" |
"10.13039/501100013589" | "10.13039/501100000271" => "10.13039/501100013589" | "10.13039/501100000271" =>
generateSimpleRelationFromAward(funder, "ukri________", a => a) generateSimpleRelationFromAward(funder, "ukri________", a => a)
//DFG
case "10.13039/501100001659" =>
val targetId = getProjectId("dfgf________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case _ => logger.debug("no match for " + funder.DOI.get) case _ => logger.debug("no match for " + funder.DOI.get)
} }

View File

@ -313,7 +313,7 @@ case object ConversionUtil {
if (f.author.DisplayName.isDefined) if (f.author.DisplayName.isDefined)
a.setFullname(f.author.DisplayName.get) a.setFullname(f.author.DisplayName.get)
if (f.affiliation != null) if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava) a.setRawAffiliationString(List(f.affiliation).asJava)
a.setPid( a.setPid(
List( List(
createSP( createSP(
@ -386,7 +386,7 @@ case object ConversionUtil {
a.setFullname(f.author.DisplayName.get) a.setFullname(f.author.DisplayName.get)
if (f.affiliation != null) if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava) a.setRawAffiliationString(List(f.affiliation).asJava)
a.setPid( a.setPid(
List( List(

View File

@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication} import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo} import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo}
import org.apache.commons.lang.StringUtils import org.apache.commons.lang3.StringUtils
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST._ import org.json4s.JsonAST._

View File

@ -6,11 +6,11 @@ import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.amazonaws.util.StringUtils;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
@ -81,7 +81,7 @@ public class Utils implements Serializable {
Community c = new Community(); Community c = new Community();
c.setId(cm.getId()); c.setId(cm.getId());
c.setZenodoCommunities(cm.getOtherZenodoCommunities()); c.setZenodoCommunities(cm.getOtherZenodoCommunities());
if (!StringUtils.isNullOrEmpty(cm.getZenodoCommunity())) if (StringUtils.isNotBlank(cm.getZenodoCommunity()))
c.getZenodoCommunities().add(cm.getZenodoCommunity()); c.getZenodoCommunities().add(cm.getZenodoCommunity());
c.setSubjects(cm.getSubjects()); c.setSubjects(cm.getSubjects());
c.getSubjects().addAll(cm.getFos()); c.getSubjects().addAll(cm.getFos());

View File

@ -4,7 +4,7 @@ package eu.dnetlib.dhp.bulktag.community;
import java.io.Serializable; import java.io.Serializable;
import java.lang.reflect.InvocationTargetException; import java.lang.reflect.InvocationTargetException;
import org.apache.htrace.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnore;
import eu.dnetlib.dhp.bulktag.criteria.Selection; import eu.dnetlib.dhp.bulktag.criteria.Selection;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;

View File

@ -9,10 +9,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.*;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -25,8 +22,6 @@ public class GraphHiveTableImporterJob {
private static final Logger log = LoggerFactory.getLogger(GraphHiveTableImporterJob.class); private static final Logger log = LoggerFactory.getLogger(GraphHiveTableImporterJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -74,7 +69,12 @@ public class GraphHiveTableImporterJob {
private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName, private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
Class<T> clazz, int numPartitions) { Class<T> clazz, int numPartitions) {
Dataset<String> dataset = spark.read().textFile(inputPath); final Encoder<T> clazzEncoder = Encoders.bean(clazz);
Dataset<Row> dataset = spark
.read()
.schema(clazzEncoder.schema())
.json(inputPath);
if (numPartitions > 0) { if (numPartitions > 0) {
log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions); log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
@ -82,7 +82,6 @@ public class GraphHiveTableImporterJob {
} }
dataset dataset
.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.saveAsTable(tableIdentifier(hiveDbName, clazz)); .saveAsTable(tableIdentifier(hiveDbName, clazz));
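Instead of reading the dump as text and deserialising every line with Jackson, the importer now lets Spark parse the JSON directly against the schema derived from the OAF bean. The same pattern in isolation (class and method names here are illustrative, not from the repository):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SchemaDrivenJsonReadSketch {

    static <T> Dataset<Row> readWithBeanSchema(SparkSession spark, String inputPath, Class<T> beanClass) {
        final Encoder<T> encoder = Encoders.bean(beanClass);
        return spark
            .read()
            .schema(encoder.schema()) // columns and types come from the bean, no schema inference pass
            .json(inputPath);
    }
}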

View File

@ -55,29 +55,7 @@ import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.AccessRight; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.EoscIfGuidelines;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.InstanceTypeMapping;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ -667,22 +645,25 @@ public abstract class AbstractMdRecordToOafMapper {
return this.vocs.getTermAsQualifier(schemeId, classId); return this.vocs.getTermAsQualifier(schemeId, classId);
} }
protected List<StructuredProperty> prepareListStructPropsWithValidQualifier( protected List<HashableStructuredProperty> prepareListStructPropsWithValidQualifier(
final Node node, final Node node,
final String xpath, final String xpath,
final String xpathClassId, final String xpathClassId,
final String schemeId, final String schemeId,
final DataInfo info) { final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>(); final Set<HashableStructuredProperty> res = new HashSet<>();
for (final Object o : node.selectNodes(xpath)) { for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o; final Node n = (Node) o;
final String classId = n.valueOf(xpathClassId).trim(); final String classId = n.valueOf(xpathClassId).trim();
if (this.vocs.termExists(schemeId, classId)) { if (this.vocs.termExists(schemeId, classId)) {
res.add(structuredProperty(n.getText(), this.vocs.getTermAsQualifier(schemeId, classId), info)); res
.add(
HashableStructuredProperty
.newInstance(n.getText(), this.vocs.getTermAsQualifier(schemeId, classId), info));
} }
} }
return res; return Lists.newArrayList(res);
} }
protected List<StructuredProperty> prepareListStructProps( protected List<StructuredProperty> prepareListStructProps(
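Switching the accumulator from a List to a HashSet drops duplicate entries, but that only works if the element type defines value-based equals/hashCode, which is why the method now collects HashableStructuredProperty instead of StructuredProperty. The idea in isolation, with a hypothetical value object (not the actual dhp-schemas class):

import java.util.HashSet;
import java.util.Set;

public class PidDedupSketch {

    // hypothetical value type: two pids are equal when value and classid are equal
    record Pid(String value, String classid) {}

    public static void main(String[] args) {
        Set<Pid> res = new HashSet<>();
        res.add(new Pid("10.1234/abc", "doi"));
        res.add(new Pid("10.1234/abc", "doi")); // duplicate, silently dropped by the set
        System.out.println(res.size()); // 1
    }
}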

View File

@ -519,6 +519,28 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
r1 = setRelationSemantic(r1, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO); r1 = setRelationSemantic(r1, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
r2 = setRelationSemantic(r2, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO); r2 = setRelationSemantic(r2, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
break; break;
case "resultOrganization_affiliation_isAuthorInstitutionOf":
if (!"organization".equals(sourceType)) {
throw new IllegalStateException(
String
.format(
"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
semantics));
}
r1 = setRelationSemantic(r1, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF);
r2 = setRelationSemantic(r2, RESULT_ORGANIZATION, AFFILIATION, HAS_AUTHOR_INSTITUTION);
break;
case "resultOrganization_affiliation_hasAuthorInstitution":
if (!"organization".equals(targetType)) {
throw new IllegalStateException(
String
.format(
"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
semantics));
}
r1 = setRelationSemantic(r1, RESULT_ORGANIZATION, AFFILIATION, HAS_AUTHOR_INSTITUTION);
r2 = setRelationSemantic(r2, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF);
break;
default: default:
throw new IllegalArgumentException("claim semantics not managed: " + semantics); throw new IllegalArgumentException("claim semantics not managed: " + semantics);
} }

View File

@ -25,6 +25,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
public class OafToOafMapper extends AbstractMdRecordToOafMapper { public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@ -380,7 +381,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
return prepareListStructPropsWithValidQualifier( return prepareListStructPropsWithValidQualifier(
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info) doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info)
.stream() .stream()
.map(CleaningFunctions::normalizePidValue) .map(PidCleaner::normalizePidValue)
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -24,6 +24,7 @@ import eu.dnetlib.dhp.schema.common.RelationInverse;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
public class OdfToOafMapper extends AbstractMdRecordToOafMapper { public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@ -93,7 +94,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
author.setFullname(String.format("%s, %s", author.getSurname(), author.getName())); author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
} }
author.setAffiliation(prepareListFields(n, "./*[local-name()='affiliation']", info)); author.setRawAffiliationString(prepareListString(n, "./*[local-name()='affiliation']"));
author.setPid(preparePids(n, info)); author.setPid(preparePids(n, info));
author.setRank(pos++); author.setRank(pos++);
res.add(author); res.add(author);
@ -504,7 +505,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
final Set<StructuredProperty> res = new HashSet<>(); final Set<HashableStructuredProperty> res = new HashSet<>();
res res
.addAll( .addAll(
prepareListStructPropsWithValidQualifier( prepareListStructPropsWithValidQualifier(
@ -524,7 +525,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
return res return res
.stream() .stream()
.map(CleaningFunctions::normalizePidValue) .map(PidCleaner::normalizePidValue)
.filter(CleaningFunctions::pidFilter)
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -22,5 +22,11 @@
"paramLongName": "targetPath", "paramLongName": "targetPath",
"paramDescription": "the output path of the graph enriched", "paramDescription": "the output path of the graph enriched",
"paramRequired": true "paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingDir",
"paramDescription": "the working dir",
"paramRequired": true
} }
] ]

View File

@ -47,13 +47,15 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
log.info(s"orcidPath is '$orcidPath'") log.info(s"orcidPath is '$orcidPath'")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
log.info(s"targetPath is '$targetPath'") log.info(s"targetPath is '$targetPath'")
val workingDir = parser.get("workingDir")
log.info(s"targetPath is '$workingDir'")
createTemporaryData(graphPath, orcidPath, targetPath) createTemporaryData(graphPath, orcidPath, workingDir)
analisys(targetPath) analisys(workingDir)
generateGraph(graphPath, targetPath) generateGraph(graphPath, workingDir, targetPath)
} }
private def generateGraph(graphPath: String, targetPath: String): Unit = { private def generateGraph(graphPath: String, workingDir: String, targetPath: String): Unit = {
ModelSupport.entityTypes.asScala ModelSupport.entityTypes.asScala
.filter(e => ModelSupport.isResult(e._1)) .filter(e => ModelSupport.isResult(e._1))
@ -63,7 +65,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String]
val matched = spark.read val matched = spark.read
.schema(Encoders.bean(classOf[ORCIDAuthorEnricherResult]).schema) .schema(Encoders.bean(classOf[ORCIDAuthorEnricherResult]).schema)
.parquet(s"${targetPath}/${resultType}_matched") .parquet(s"${workingDir}/${resultType}_matched")
.selectExpr("id", "enriched_author") .selectExpr("id", "enriched_author")
spark.read spark.read

View File

@ -73,14 +73,10 @@ public class GraphHiveImporterJobTest {
GraphHiveImporterJob GraphHiveImporterJob
.main( .main(
new String[] { new String[] {
"-isSparkSessionManaged", "--isSparkSessionManaged", Boolean.FALSE.toString(),
Boolean.FALSE.toString(), "--inputPath", getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
"-inputPath", "--hiveMetastoreUris", "",
getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(), "--hiveDbName", dbName
"-hiveMetastoreUris",
"",
"-hiveDbName",
dbName
}); });
ModelSupport.oafTypes ModelSupport.oafTypes

View File

@ -388,7 +388,7 @@ public class CleanGraphSparkJobTest {
.collect(Collectors.toList()); .collect(Collectors.toList());
assertNotNull(fos_subjects); assertNotNull(fos_subjects);
assertEquals(2, fos_subjects.size()); assertEquals(3, fos_subjects.size());
assertTrue( assertTrue(
fos_subjects fos_subjects
@ -396,18 +396,10 @@ public class CleanGraphSparkJobTest {
.anyMatch( .anyMatch(
s -> "0101 mathematics".equals(s.getValue()) & s -> "0101 mathematics".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) & ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
"sysimport:crosswalk:datasetarchive" "subject:fos".equals(s.getDataInfo().getProvenanceaction().getClassid())));
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
assertTrue( verify_keyword(p, "FOS: Mathematics");
fos_subjects verify_keyword(p, "FOS: Computer and information sciences");
.stream()
.anyMatch(
s -> "0102 computer and information sciences".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
verify_keyword(p, "In Situ Hybridization");
verify_keyword(p, "Avicennia");
} }
@Test @Test

View File

@ -266,7 +266,7 @@ public class GraphCleaningFunctionsTest {
.collect(Collectors.toList()); .collect(Collectors.toList());
assertNotNull(fos_subjects); assertNotNull(fos_subjects);
assertEquals(2, fos_subjects.size()); assertEquals(3, fos_subjects.size());
assertTrue( assertTrue(
fos_subjects fos_subjects
@ -274,18 +274,18 @@ public class GraphCleaningFunctionsTest {
.anyMatch( .anyMatch(
s -> "0101 mathematics".equals(s.getValue()) & s -> "0101 mathematics".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) & ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
"sysimport:crosswalk:datasetarchive" "subject:fos".equals(s.getDataInfo().getProvenanceaction().getClassid())));
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
assertTrue( assertTrue(
fos_subjects fos_subjects
.stream() .stream()
.anyMatch( .anyMatch(
s -> "0102 computer and information sciences".equals(s.getValue()) & s -> "0102 computer and information sciences".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))); ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
"subject:fos".equals(s.getDataInfo().getProvenanceaction().getClassid())));
verify_keyword(p_cleaned, "In Situ Hybridization"); verify_keyword(p_cleaned, "FOS: Computer and information sciences");
verify_keyword(p_cleaned, "Avicennia"); verify_keyword(p_cleaned, "FOS: Mathematics");
// TODO add more assertions to verify the cleaned values // TODO add more assertions to verify the cleaned values
System.out.println(MAPPER.writeValueAsString(p_cleaned)); System.out.println(MAPPER.writeValueAsString(p_cleaned));

View File

@ -44,7 +44,7 @@ class GenerateEntitiesApplicationTest {
	}

	@Test
-	void testMergeResult() throws IOException, DocumentException {
+	void testMergeResult() throws IOException {
		Result publication = getResult("oaf_record.xml", Publication.class);
		Result dataset = getResult("odf_dataset.xml", Dataset.class);
		Result software = getResult("odf_software.xml", Software.class);
@ -69,15 +69,15 @@ class GenerateEntitiesApplicationTest {
		verifyMerge(orp, software, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID);
	}

-	protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
+	protected <T extends Result> void verifyMerge(Result r1, Result r2, Class<T> clazz,
		String resultType) {
-		final Result merge = (Result) MergeUtils.merge(publication, dataset);
+		final Result merge = MergeUtils.checkedMerge(r1, r2, true);
		assertTrue(clazz.isAssignableFrom(merge.getClass()));
		assertEquals(resultType, merge.getResulttype().getClassid());
	}

	protected <T extends Result> Result getResult(String xmlFileName, Class<T> clazz)
-		throws IOException, DocumentException {
+		throws IOException {
		final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName));
		return new OdfToOafMapper(vocs, false, true)
			.processMdRecord(xml)
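For context, a minimal sketch of the call pattern exercised by verifyMerge above, assuming the dhp-schemas types visible in the diff; the outcome asserted in the last line is an assumption added here for illustration, not something stated in the change.

// Hedged sketch, not repository code: mirrors how the updated test drives MergeUtils.checkedMerge.
Result publication = getResult("oaf_record.xml", Publication.class); // helper shown in the diff above
Result dataset = getResult("odf_dataset.xml", Dataset.class);
// the boolean flag matches the call in the diff; its exact semantics are assumed here
Result merge = MergeUtils.checkedMerge(publication, dataset, true);
assertTrue(Publication.class.isAssignableFrom(merge.getClass())); // assumed: the publication type prevails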

View File

@ -216,7 +216,7 @@ class MappersTest {
	}

	@Test
-	void testPublication_PubMed() throws IOException, DocumentException {
+	void testPublication_PubMed() throws IOException {
		final String xml = IOUtils
			.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record_pubmed.xml")));
@ -264,8 +264,17 @@ class MappersTest {
		assertFalse(p.getSubject().isEmpty());
		assertFalse(p.getPid().isEmpty());
-		assertEquals("PMC1517292", p.getPid().get(0).getValue());
-		assertEquals("pmc", p.getPid().get(0).getQualifier().getClassid());
+		assertTrue(p.getPid().stream().anyMatch(pi -> "pmc".equals(pi.getQualifier().getClassid())));
+		assertEquals(
+			"PMC1517292",
+			p
+				.getPid()
+				.stream()
+				.filter(pi -> "pmc".equals(pi.getQualifier().getClassid()))
+				.findFirst()
+				.get()
+				.getValue());
		assertNotNull(p.getInstance());
		assertFalse(p.getInstance().isEmpty());
@ -292,7 +301,7 @@ class MappersTest {
	}

	@Test
-	void testPublicationInvisible() throws IOException, DocumentException {
+	void testPublicationInvisible() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml")));
@ -307,6 +316,25 @@ class MappersTest {
	}

+	@Test
+	void testPublicationInvisible_BASE() throws IOException {
+		final String xml = IOUtils
+			.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_record_base.xml")));
+		final List<Oaf> list = new OdfToOafMapper(vocs, true, true).processMdRecord(xml);
+		assertFalse(list.isEmpty());
+		assertTrue(list.get(0) instanceof Publication);
+		final Publication p = (Publication) list.get(0);
+		assertTrue(p.getDataInfo().getInvisible());
+		System.out.println(new ObjectMapper().writeValueAsString(p));
+	}

	@Test
	void testOdfFwfEBookLibrary() throws IOException {
		final String xml = IOUtils
@ -318,7 +346,7 @@ class MappersTest {
	}

	@Test
-	void testDataset() throws IOException, DocumentException {
+	void testDataset() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_dataset.xml")));
		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -332,19 +360,19 @@ class MappersTest {
		final Relation r1 = (Relation) list.get(1);
		final Relation r2 = (Relation) list.get(2);
-		assertEquals(d.getId(), r1.getSource());
-		assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r1.getTarget());
+		assertEquals(d.getId(), r1.getTarget());
+		assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r1.getSource());
		assertEquals(ModelConstants.RESULT_PROJECT, r1.getRelType());
		assertEquals(ModelConstants.OUTCOME, r1.getSubRelType());
-		assertEquals(ModelConstants.IS_PRODUCED_BY, r1.getRelClass());
+		assertEquals(ModelConstants.PRODUCES, r1.getRelClass());
		assertTrue(r1.getValidated());
		assertEquals("2020-01-01", r1.getValidationDate());
-		assertEquals(d.getId(), r2.getTarget());
-		assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r2.getSource());
+		assertEquals(d.getId(), r2.getSource());
+		assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r2.getTarget());
		assertEquals(ModelConstants.RESULT_PROJECT, r2.getRelType());
		assertEquals(ModelConstants.OUTCOME, r2.getSubRelType());
-		assertEquals(ModelConstants.PRODUCES, r2.getRelClass());
+		assertEquals(ModelConstants.IS_PRODUCED_BY, r2.getRelClass());
		assertTrue(r2.getValidated());
		assertEquals("2020-01-01", r2.getValidationDate());
@ -378,15 +406,15 @@ class MappersTest {
		assertEquals("Baracchini", author.get().getSurname());
		assertEquals("Theo", author.get().getName());
-		assertEquals(1, author.get().getAffiliation().size());
+		assertEquals(1, author.get().getRawAffiliationString().size());
-		final Optional<Field<String>> opAff = author
+		final Optional<String> opAff = author
			.get()
-			.getAffiliation()
+			.getRawAffiliationString()
			.stream()
			.findFirst();
		assertTrue(opAff.isPresent());
-		final Field<String> affiliation = opAff.get();
-		assertEquals("ISTI-CNR", affiliation.getValue());
+		final String affiliation = opAff.get();
+		assertEquals("ISTI-CNR", affiliation);

		assertFalse(d.getSubject().isEmpty());
		assertFalse(d.getInstance().isEmpty());
@ -450,7 +478,7 @@ class MappersTest {
	}

	@Test
-	void testOdfBielefeld() throws IOException, DocumentException {
+	void testOdfBielefeld() throws IOException {
		final String xml = IOUtils
			.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_bielefeld.xml")));
@ -501,7 +529,7 @@ class MappersTest {
	}

	@Test
-	void testOpentrial() throws IOException, DocumentException {
+	void testOpentrial() throws IOException {
		final String xml = IOUtils
			.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_opentrial.xml")));
@ -741,7 +769,7 @@ class MappersTest {
	}

	@Test
-	void testSoftware() throws IOException, DocumentException {
+	void testSoftware() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_software.xml")));
		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -763,22 +791,21 @@ class MappersTest {
		final Relation r1 = (Relation) list.get(1);
		final Relation r2 = (Relation) list.get(2);
-		assertEquals(s.getId(), r1.getSource());
-		assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r1.getTarget());
+		assertEquals(s.getId(), r1.getTarget());
+		assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r1.getSource());
		assertEquals(ModelConstants.RESULT_RESULT, r1.getRelType());
		assertEquals(ModelConstants.RELATIONSHIP, r1.getSubRelType());
-		assertEquals(ModelConstants.IS_REFERENCED_BY, r1.getRelClass());
+		assertEquals(ModelConstants.REFERENCES, r1.getRelClass());
-		assertEquals(s.getId(), r2.getTarget());
-		assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r2.getSource());
+		assertEquals(s.getId(), r2.getSource());
+		assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r2.getTarget());
		assertEquals(ModelConstants.RESULT_RESULT, r2.getRelType());
		assertEquals(ModelConstants.RELATIONSHIP, r2.getSubRelType());
-		assertEquals(ModelConstants.REFERENCES, r2.getRelClass());
+		assertEquals(ModelConstants.IS_REFERENCED_BY, r2.getRelClass());
	}

	@Test
-	void testClaimDedup() throws IOException, DocumentException {
+	void testClaimDedup() throws IOException {
		final String xml = IOUtils
			.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_claim_dedup.xml")));
		final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
@ -792,7 +819,7 @@ class MappersTest {
	}

	@Test
-	void testNakala() throws IOException, DocumentException {
+	void testNakala() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_nakala.xml")));
		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -820,7 +847,7 @@ class MappersTest {
	}

	@Test
-	void testEnermaps() throws IOException, DocumentException {
+	void testEnermaps() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("enermaps.xml")));
		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -845,7 +872,7 @@ class MappersTest {
	}

	@Test
-	void testClaimFromCrossref() throws IOException, DocumentException {
+	void testClaimFromCrossref() throws IOException {
		final String xml = IOUtils
			.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_claim_crossref.xml")));
		final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
@ -862,7 +889,7 @@ class MappersTest {
	}

	@Test
-	void testODFRecord() throws IOException, DocumentException {
+	void testODFRecord() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_record.xml")));
		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
		System.out.println("***************");
@ -882,7 +909,7 @@ class MappersTest {
	}

	@Test
-	void testTextGrid() throws IOException, DocumentException {
+	void testTextGrid() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("textgrid.xml")));
		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -916,7 +943,7 @@ class MappersTest {
	}

	@Test
-	void testBologna() throws IOException, DocumentException {
+	void testBologna() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf-bologna.xml")));
		final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
@ -933,7 +960,7 @@ class MappersTest {
	}

	@Test
-	void testJairo() throws IOException, DocumentException {
+	void testJairo() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_jairo.xml")));
		final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
@ -971,7 +998,7 @@ class MappersTest {
	}

	@Test
-	void testZenodo() throws IOException, DocumentException {
+	void testZenodo() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo.xml")));
		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -1016,7 +1043,7 @@ class MappersTest {
	}

	@Test
-	void testOdfFromHdfs() throws IOException, DocumentException {
+	void testOdfFromHdfs() throws IOException {
		final String xml = IOUtils
			.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_from_hdfs.xml")));
@ -1065,7 +1092,7 @@ class MappersTest {
	}

	@Test
-	void testXMLEncodedURL() throws IOException, DocumentException {
+	void testXMLEncodedURL() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url.xml")));
		final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
@ -1081,7 +1108,7 @@ class MappersTest {
	}

	@Test
-	void testXMLEncodedURL_ODF() throws IOException, DocumentException {
+	void testXMLEncodedURL_ODF() throws IOException {
		final String xml = IOUtils
			.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url_odf.xml")));
		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -1245,7 +1272,7 @@ class MappersTest {
	}

	@Test
-	void testRiunet() throws IOException, DocumentException {
+	void testRiunet() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("riunet.xml")));
		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
		System.out.println("***************");
@ -1291,7 +1318,7 @@ class MappersTest {
	}

	@Test
-	void testIRISPub() throws IOException, DocumentException {
+	void testIRISPub() throws IOException {
		final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("iris-odf.xml")));
		final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
		System.out.println("***************");

View File

@ -16,6 +16,8 @@ import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.common.RelationInverse;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.BeforeEach;
@ -364,6 +366,39 @@ class MigrateDbEntitiesApplicationTest {
		assertValidId(r1.getCollectedfrom().get(0).getKey());
		assertValidId(r2.getCollectedfrom().get(0).getKey());
	}
@Test
void testProcessClaims_affiliation() throws Exception {
final List<TypedField> fields = prepareMocks("claimsrel_resultset_affiliation.json");
final List<Oaf> list = app.processClaims(rs);
assertEquals(2, list.size());
verifyMocks(fields);
assertTrue(list.get(0) instanceof Relation);
assertTrue(list.get(1) instanceof Relation);
final Relation r1 = (Relation) list.get(0);
final Relation r2 = (Relation) list.get(1);
assertValidId(r1.getSource());
assertValidId(r1.getTarget());
assertValidId(r2.getSource());
assertValidId(r2.getTarget());
assertNotNull(r1.getDataInfo());
assertNotNull(r2.getDataInfo());
assertNotNull(r1.getDataInfo().getTrust());
assertNotNull(r2.getDataInfo().getTrust());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
assertValidId(r1.getCollectedfrom().get(0).getKey());
assertValidId(r2.getCollectedfrom().get(0).getKey());
}
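The new test above checks that an affiliation claim yields two relations that are mutual inverses. A hedged sketch of one plausible reading of the fixture (the decomposition below is an assumption for illustration, not the repository's actual parsing code):

// Assumption: the claim's "semantics" value encodes relType, subRelType and relClass.
String semantics = "resultOrganization_affiliation_isAuthorInstitutionOf"; // value used in the new fixture
String[] parts = semantics.split("_");
String relType = parts[0];    // "resultOrganization"
String subRelType = parts[1]; // "affiliation"
String relClass = parts[2];   // "isAuthorInstitutionOf"; the second relation carries the inverse class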
	private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {
		final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));

View File

@ -794,28 +794,6 @@
      },
      "value": "FOS: Computer and information sciences"
    },
-    {
-      "dataInfo": {
-        "deletedbyinference": false,
-        "inferenceprovenance": "",
-        "inferred": false,
-        "invisible": false,
-        "provenanceaction": {
-          "classid": "sysimport:crosswalk:datasetarchive",
-          "classname": "sysimport:crosswalk:datasetarchive",
-          "schemeid": "dnet:provenanceActions",
-          "schemename": "dnet:provenanceActions"
-        },
-        "trust": "0.9"
-      },
-      "qualifier": {
-        "classid": "keyword",
-        "classname": "keyword",
-        "schemeid": "dnet:subject_classification_typologies",
-        "schemename": "dnet:subject_classification_typologies"
-      },
-      "value": "0101 mathematics"
-    },
    {
      "dataInfo": {
        "deletedbyinference": false,
@ -831,8 +809,8 @@
        "trust": "0.9"
      },
      "qualifier": {
-        "classid": "keyword",
-        "classname": "keyword",
+        "classid": "FOS",
+        "classname": "Fields of Science and Technology classification",
        "schemeid": "dnet:subject_classification_typologies",
        "schemename": "dnet:subject_classification_typologies"
      },
@ -910,8 +888,8 @@
        "inferred": false,
        "invisible": false,
        "provenanceaction": {
-          "classid": "sysimport:actionset",
-          "classname": "Harvested",
+          "classid": "subject:fos",
+          "classname": "subject:fos",
          "schemeid": "dnet:provenanceActions",
          "schemename": "dnet:provenanceActions"
        },
@ -923,7 +901,7 @@
        "schemeid": "dnet:subject_classification_typologies",
        "schemename": "dnet:subject_classification_typologies"
      },
-      "value": "Avicennia"
+      "value": "0102 computer and information sciences"
    },
    {
      "dataInfo": {

View File

@ -0,0 +1,27 @@
[
{
"field": "source_type",
"type": "string",
"value": "organization"
},
{
"field": "source_id",
"type": "string",
"value": "openorgs____::b5ca9d4340e26454e367e2908ef3872f"
},
{
"field": "target_type",
"type": "string",
"value": "software"
},
{
"field": "target_id",
"type": "string",
"value": "userclaim___::bde53826d07c8cf47c99222a375cd2e8"
},
{
"field": "semantics",
"type": "string",
"value": "resultOrganization_affiliation_isAuthorInstitutionOf"
}
]

View File

@ -0,0 +1,129 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<header xmlns="http://www.openarchives.org/OAI/2.0/">
<dri:objIdentifier>base_oa_____::7ecf1ef502253efffe203ca9a22bb9f1</dri:objIdentifier>
<identifier>ftunivqespace:oai:espace.library.uq.edu.au:UQ:336902</identifier>
<datestamp>2020-12-22T10:30:27Z</datestamp>
<dr:dateOfTransformation>2024-09-10T17:21:36.972Z</dr:dateOfTransformation>
</header>
<metadata>
<datacite:resource>
<datacite:identifier identifierType="DOI">https://doi.org/10.1016/j.envint.2014.07.004</datacite:identifier>
<datacite:alternateIdentifiers>
<datacite:identifier alternateIdentifierType="url">https://espace.library.uq.edu.au/view/UQ:336902</datacite:identifier>
<datacite:identifier alternateIdentifierType="oai-original">ftunivqespace:oai:espace.library.uq.edu.au:UQ:336902</datacite:identifier>
</datacite:alternateIdentifiers>
<datacite:relatedIdentifiers/>
<datacite:resourceType>Article contribution</datacite:resourceType>
<datacite:titles>
<datacite:title>The role of environmental factors in the spatial distribution of Japanese encephalitis in mainland China</datacite:title>
</datacite:titles>
<datacite:creators>
<datacite:creator>
<datacite:creatorName>Wang, Liya</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Hu, Wenbiao</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Soares Magalhaes, Ricardo J.</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Bi, Peng</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Ding, Fan</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Sun, Hailong</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Li, Shenlong</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Yin, Wenwu</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Wei, Lan</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Liu, Qiyong</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Haque, Ubydul</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Sun, Yansong</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Huang, Liuyu</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Tong, Shilu</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Clements, Archie C.A.</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Zhang, Wenyi</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Li, Chengyi</datacite:creatorName>
</datacite:creator>
</datacite:creators>
<datacite:contributors/>
<datacite:descriptions>
<datacite:description descriptionType="Abstract">Japanese encephalitis (JE) is the most common cause of viral encephalitis and an important public health concern in the Asia-Pacific region, particularly in China where 50% of global cases are notified. To explore the association between environmental factors and human JE cases and identify the high risk areas for JE transmission in China, we used annual notified data on JE cases at the center of administrative township and environmental variables with a pixel resolution of 1. km. ×. 1. km from 2005 to 2011 to construct models using ecological niche modeling (ENM) approaches based on maximum entropy. These models were then validated by overlaying reported human JE case localities from 2006 to 2012 onto each prediction map. ENMs had good discriminatory ability with the area under the curve (AUC) of the receiver operating curve (ROC) of 0.82-0.91, and low extrinsic omission rate of 5.44-7.42%. Resulting maps showed JE being presented extensively throughout southwestern and central China, with local spatial variations in probability influenced by minimum temperatures, human population density, mean temperatures, and elevation, with contribution of 17.94%-38.37%, 15.47%-21.82%, 3.86%-21.22%, and 12.05%-16.02%, respectively. Approximately 60% of JE cases occurred in predicted high risk areas, which covered less than 6% of areas in mainland China. Our findings will help inform optimal geographical allocation of the limited resources available for JE prevention and control in China, find hidden high-risk areas, and increase the effectiveness of public health interventions against JE transmission.</datacite:description>
</datacite:descriptions>
<datacite:subjects>
<datacite:subject>Japanese encephalitis</datacite:subject>
<datacite:subject>Ecological niche model</datacite:subject>
<datacite:subject>MaxEnt</datacite:subject>
<datacite:subject>China</datacite:subject>
<datacite:subject>2300 Environmental Science</datacite:subject>
<datacite:subject classificationCode="950" subjectScheme="ddc">950</datacite:subject>
</datacite:subjects>
<datacite:publisher>Pergamon Press</datacite:publisher>
<datacite:publicationYear>2014</datacite:publicationYear>
<datacite:formats/>
<datacite:language>eng</datacite:language>
<oaf:accessrights/>
</datacite:resource>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<oaf:accessrights>UNKNOWN</oaf:accessrights>
<oaf:identifier identifierType="doi">10.1163/qwerty</oaf:identifier>
<oaf:identifier identifierType="doi">0.1163/18763308-90001038</oaf:identifier>
<oaf:identifier identifierType="doi">https://doi.org/10.1016/j.envint.2014.07.004</oaf:identifier>
<oaf:identifier identifierType="doi">https://doi.org/10.1080/09672567.2013.792375</oaf:identifier>
<oaf:identifier identifierType="doi">http://doi.org/10.1080/08673487.2012.812376</oaf:identifier>
<oaf:identifier identifierType="doi">http://dx.doi.org/10.1090/08673487.2012.812376</oaf:identifier>
<oaf:identifier identifierType="url">https://espace.library.uq.edu.au/view/UQ:336902</oaf:identifier>
<oaf:identifier identifierType="oai-original">ftunivqespace:oai:espace.library.uq.edu.au:UQ:336902</oaf:identifier>
<oaf:hostedBy name="The University of Queensland: UQ eSpace" id="opendoar____::575"/>
<oaf:collectedFrom name="Bielefeld Academic Search Engine (BASE)"
id="openaire____::base_search"/>
<oaf:dateAccepted>2014-12-01</oaf:dateAccepted>
<oaf:relation relClass="hasAuthorInstitution"
relType="resultOrganization"
subRelType="affiliation"
targetType="organization">ror_________::https://ror.org/00rqy9422</oaf:relation>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.89</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:aggregator"
classname="sysimport:crosswalk:aggregator"
schemeid="dnet:provenanceActions"
schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</metadata>
</record>

View File

@ -130,5 +130,10 @@
"value": [ "value": [
"Pippo", "Foo" "Pippo", "Foo"
] ]
},
{
"field": "typology",
"type": "string",
"value": "Government"
} }
] ]

View File

@ -31,5 +31,11 @@ class ORCIDAuthorMatchersTest {
    assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
    // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
  }

+  @Test def testDocumentationNames(): Unit = {
+    assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones"))
+  }

+  @Test def testDocumentationNames2(): Unit = {
+    assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller Jones"))
+  }
}

View File

@ -4,12 +4,13 @@ import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Test import org.junit.jupiter.api.{Disabled, Test}
import org.objenesis.strategy.StdInstantiatorStrategy import org.objenesis.strategy.StdInstantiatorStrategy
class ScholixGenerationTest { class ScholixGenerationTest {
@Test @Test
@Disabled
def generateScholix(): Unit = { def generateScholix(): Unit = {
val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate() val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()

View File

@ -175,7 +175,8 @@ public class XmlSerializationUtils {
					.append("<")
					.append(name)
					.append(" ")
-					.append(attr(measure.getId(), kv.getValue()))
+					.append(attr("id", measure.getId()))
+					.append(attr("score", kv.getValue()))
					.append(attr("datasource", kv.getKey()))
					.append(" />");
			}
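For context, a hedged illustration of the element this change should produce, assuming the element name is "measure" and using the views unit added to the record fixture further below: the measure id and the unit value now become separate id and score attributes, e.g.

<measure id="views" score="5" datasource="opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO" />

whereas the previous code collapsed the measure id and its value into a single attribute named after the measure id.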

View File

@ -8,6 +8,26 @@
    }
  ],
  "measures": [
{
"id": "views",
"unit": [
{
"key": "opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO",
"value": "5",
"dataInfo": null
}
]
},
{
"id": "downloads",
"unit": [
{
"key": "opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO",
"value": "2",
"dataInfo": null
}
]
},
    {
      "id": "influence",
      "unit": [

View File

@ -1,63 +0,0 @@
#/usr/bin/bash
# Read log files from ranking scripts and create a two-line file
# with score limits for the various measures. To be used by Kleanthis
attrank_file=$(ls *attrank*.log);
pr_file=$(ls *pagerank*.log)
ram_file=$(ls *ram*.log);
cc_file=$(ls *cc*.log);
impulse_file=$(ls *impulse*.log);
echo
echo "-----------------------------"
echo "Attrank file:${attrank_file}";
echo "PageRank file:${pr_file}";
echo "RAM file:${ram_file}";
echo "CC file:${cc_file}";
echo "Impulse file:${impulse_file}";
echo "-----------------------------"
echo
echo
# output file will be called score_limits.csv
echo -e "influence_top001\tinfluence_top01\tinfluence_top1\tinfluence_top10\tpopularity_top001\tpopularity_top01\tpopularity_top1\tpopularity_top10\timpulse_top001\timpulse_top01\timpulse_top1\timpulse_top10\tcc_top001\tcc_top01\tcc_top1\tcc_top10" > score_limits.csv
# ---------------------------------------------------- #
# Get respective score limits (we don't need RAM)
inf_001=$(grep "^0.01%" ${pr_file} | cut -f 2);
inf_01=$(grep "^0.1%" ${pr_file} | cut -f 2);
inf_1=$(grep "^1%" ${pr_file} | cut -f 2);
inf_10=$(grep "^10%" ${pr_file} | cut -f 2);
echo "Influnence limits:"
echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}";
# ---------------------------------------------------- #
pop_001=$(grep "^0.01%" ${attrank_file} | cut -f 2);
pop_01=$(grep "^0.1%" ${attrank_file} | cut -f 2);
pop_1=$(grep "^1%" ${attrank_file} | cut -f 2);
pop_10=$(grep "^10%" ${attrank_file} | cut -f 2);
echo "Popularity limits:";
echo -e "${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}";
# ---------------------------------------------------- #
imp_001=$(grep "^0.01%" ${impulse_file} | cut -f 2);
imp_01=$(grep "^0.1%" ${impulse_file} | cut -f 2);
imp_1=$(grep "^1%" ${impulse_file} | cut -f 2);
imp_10=$(grep "^10%" ${impulse_file} | cut -f 2);
echo "Popularity limits:";
echo -e "${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}";
# ---------------------------------------------------- #
cc_001=$(grep "^0.01%" ${cc_file} | cut -f 2);
cc_01=$(grep "^0.1%" ${cc_file} | cut -f 2);
cc_1=$(grep "^1%" ${cc_file} | cut -f 2);
cc_10=$(grep "^10%" ${cc_file} | cut -f 2);
echo "Popularity limits:";
echo -e "${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}";
# ---------------------------------------------------- #
echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}\t${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}\t${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}\t${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}" >> score_limits.csv
echo
echo "score_limits.csv contents:"
cat score_limits.csv
echo;
echo;

View File

@ -1,60 +0,0 @@
import json
import sys
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
if len(sys.argv) != 3:
print("Usage: map_openaire_ids_to_dois.py <hdfs_src_dir> <hdfs_output_dir>")
sys.exit(-1)
conf = SparkConf().setAppName('BIP!: Map OpenAIRE IDs to DOIs')
sc = SparkContext(conf = conf)
spark = SparkSession.builder.appName('BIP!: Map OpenAIRE IDs to DOIs').getOrCreate()
sc.setLogLevel('OFF')
src_dir = sys.argv[1]
output = sys.argv[2]
# src_dir = "/tmp/beta_provision/graph/21_graph_cleaned/"
# output = '/tmp/openaireid_to_dois/'
def transform(doc):
# get publication year from 'doc.dateofacceptance.value'
dateofacceptance = doc.get('dateofacceptance', {}).get('value')
year = 0
if (dateofacceptance is not None):
year = dateofacceptance.split('-')[0]
# for each pid get 'pid.value' if 'pid.qualifier.classid' equals to 'doi'
dois = [ pid['value'] for pid in doc.get('pid', []) if (pid.get('qualifier', {}).get('classid') == 'doi' and pid['value'] is not None)]
num_dois = len(dois)
# exlcude openaire ids that do not correspond to DOIs
if (num_dois == 0):
return None
fields = [ doc['id'], str(num_dois), chr(0x02).join(dois), str(year) ]
return '\t'.join([ v.encode('utf-8') for v in fields ])
docs = None
for result_type in ["publication", "dataset", "software", "otherresearchproduct"]:
tmp = sc.textFile(src_dir + result_type).map(json.loads)
if (docs is None):
docs = tmp
else:
# append all result types in one RDD
docs = docs.union(tmp)
docs = docs.filter(lambda d: d.get('dataInfo', {}).get('deletedbyinference') == False and d.get('dataInfo', {}).get('invisible') == False)
docs = docs.map(transform).filter(lambda d: d is not None)
docs.saveAsTextFile(output)

View File

@ -1,168 +0,0 @@
#!/usr/bin/python
# This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow
# and uses this mapping to create doi-based score files in the format required by BiP! DB.
# This is done by reading each openaire-id based ranking file and joining the openaire based
# score and classes to all the corresponding dois.
#################################################################################################
# Imports
import sys
# Sparksession lib to communicate with cluster via session object
from pyspark.sql import SparkSession
# Import sql types to define schemas
from pyspark.sql.types import *
# Import sql functions with shorthand alias
import pyspark.sql.functions as F
from pyspark.sql.functions import max
# from pyspark.sql.functions import udf
#################################################################################################
#################################################################################################
# Clean up directory name - no longer needed in final workflow version
'''
def clean_directory_name(dir_name):
# We have a name with the form *_bip_universe<digits>_* or *_graph_universe<digits>_*
# and we need to keep the parts in *
dir_name_parts = dir_name.split('_')
dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)]
dir_name = dir_name.replace("openaire_id_graph", "openaire_ids")
clean_name = dir_name + ".txt.gz"
# clean_name = '_'.join(dir_name_parts)
# if '_ids' not in clean_name:
# clean_name = clean_name.replace('id_', 'ids_')
# clean_name = clean_name.replace('.txt', '')
# clean_name = clean_name.replace('.gz', '')
# if 'openaire_ids_' in clean_name:
# clean_name = clean_name.replace('openaire_ids_', '')
# clean_name = clean_name + '.txt.gz'
# else:
# clean_name = clean_name + '.txt.gz'
return clean_name
'''
#################################################################################################
if len(sys.argv) < 3:
print ("Usage: ./map_scores_to_dois.py <synonym_folder> <num_partitions> <score_file_1> <score_file_2> <...etc...>")
sys.exit(-1)
# Read arguments
synonyms_folder = sys.argv[1]
num_partitions = int(sys.argv[2])
input_file_list = [argument.replace("_openaire_id_graph", "").replace("_openaire_id_graph_", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]]
# input_file_list = [clean_directory_name(item) for item in input_file_list]
# Prepare output specific variables
output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list]
output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list]
# --- INFO MESSAGES --- #
print ("\n\n----------------------------")
print ("Mpping openaire ids to DOIs")
print ("Reading input from: " + synonyms_folder)
print ("Num partitions: " + str(num_partitions))
print ("Input files:" + " -- ".join(input_file_list))
print ("Output files: " + " -- ".join(output_file_list))
print ("----------------------------\n\n")
#######################################################################################
# We weill define the following schemas:
# --> the schema of the openaire - doi mapping file [string - int - doi_list] (the separator of the doi-list is a non printable character)
# --> a schema for floating point ranking scores [string - float - string] (the latter string is the class)
# --> a schema for integer ranking scores [string - int - string] (the latter string is the class)
float_schema = StructType([
StructField('id', StringType(), False),
StructField('score', FloatType(), False),
StructField('class', StringType(), False)
])
int_schema = StructType([
StructField('id', StringType(), False),
StructField('score', IntegerType(), False),
StructField('class', StringType(), False)
])
# This schema concerns the output of the file
# containing the number of references of each doi
synonyms_schema = StructType([
StructField('id', StringType(), False),
StructField('num_synonyms', IntegerType(), False),
StructField('doi_list', StringType(), False),
])
#######################################################################################
# Start spark session
spark = SparkSession.builder.appName('Map openaire scores to DOIs').getOrCreate()
# Set Log Level for spark session
spark.sparkContext.setLogLevel('WARN')
#######################################################################################
# MAIN Program
# Read and repartition the synonym folder - also cache it since we will need to perform multiple joins
synonym_df = spark.read.schema(synonyms_schema).option('delimiter', '\t').csv(synonyms_folder)
synonym_df = synonym_df.select('id', F.split(F.col('doi_list'), chr(0x02)).alias('doi_list'))
synonym_df = synonym_df.select('id', F.explode('doi_list').alias('doi')).repartition(num_partitions, 'id').cache()
# TESTING
# print ("Synonyms: " + str(synonym_df.count()))
# print ("DF looks like this:" )
# synonym_df.show(1000, False)
print ("\n\n-----------------------------")
# Now we need to join the score files on the openaire-id with the synonyms and then keep
# only doi - score - class and write this to the output
for offset, input_file in enumerate(input_file_list):
print ("Mapping scores from " + input_file)
# Select correct schema
schema = int_schema
if "attrank" in input_file.lower() or "pr" in input_file.lower() or "ram" in input_file.lower():
schema = float_schema
# Load file to dataframe
ranking_df = spark.read.schema(schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, 'id')
# Get max score
max_score = ranking_df.select(max('score').alias('max')).collect()[0]['max']
print ("Max Score for " + str(input_file) + " is " + str(max_score))
# TESTING
# print ("Loaded df sample:")
# ranking_df.show(1000, False)
# Join scores to synonyms and keep required fields
doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'class').repartition(num_partitions, 'doi').cache()
# Write output
output_file = output_file_list[offset]
print ("Writing to: " + output_file)
doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip')
# Creata another file for the bip update process
ranking_df = ranking_df.select('id', 'score', F.lit(F.col('score')/max_score).alias('normalized_score'), 'class', F.col('class').alias('class_dup'))
doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'normalized_score', 'class', 'class_dup').repartition(num_partitions, 'doi').cache()
output_file = output_file.replace(".txt.gz", "_for_bip_update.txt.gz")
print ("Writing bip update to: " + output_file)
doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip')
# Free memory?
ranking_df.unpersist(True)
print ("-----------------------------")
print ("\n\nFinished!\n\n")

View File

@ -17,10 +17,6 @@
			<name>openaireGraphInputPath</name>
			<value>${nameNode}/${workingDir}/openaire_id_graph</value>
		</property>
-		<property>
-			<name>synonymFolder</name>
-			<value>${nameNode}/${workingDir}/openaireid_to_dois/</value>
-		</property>
		<property>
			<name>checkpointDir</name>
			<value>${nameNode}/${workingDir}/check/</value>
@ -32,29 +28,34 @@
		</configuration>
	</global>

-	<!-- start using a decision node, so as to determine from which point onwards a job will continue -->
+	<!-- Start using a decision node, to determine from which point onwards a job will continue -->
	<start to="entry-point-decision" />

	<decision name="entry-point-decision">
		<switch>
-			<!-- The default will be set as the normal start, a.k.a. get-doi-synonyms -->
-			<!-- If any different condition is set, go to the corresponding start -->
+			<!-- Start from creating the citation network (i.e., normal execution should start from here) -->
+			<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>

+			<!-- Different citation-based impact indicators are computed -->
			<case to="spark-cc">${wf:conf('resume') eq "cc"}</case>
			<case to="spark-ram">${wf:conf('resume') eq "ram"}</case>
			<case to="spark-impulse">${wf:conf('resume') eq "impulse"}</case>
			<case to="spark-pagerank">${wf:conf('resume') eq "pagerank"}</case>
			<case to="spark-attrank">${wf:conf('resume') eq "attrank"}</case>
-			<!-- <case to="iterative-rankings">${wf:conf('resume') eq "rankings-iterative"}</case> -->

+			<!-- Format the results appropriately before transforming them to action sets -->
			<case to="get-file-names">${wf:conf('resume') eq "format-results"}</case>
-			<case to="map-openaire-to-doi">${wf:conf('resume') eq "map-ids"}</case>
-			<case to="map-scores-to-dois">${wf:conf('resume') eq "map-scores"}</case>
-			<case to="create-openaire-ranking-graph">${wf:conf('resume') eq "start"}</case>

			<!-- Aggregation of impact scores on the project level -->
			<case to="project-impact-indicators">${wf:conf('resume') eq "projects-impact"}</case>

+			<!-- Create action sets -->
			<case to="create-actionset">${wf:conf('resume') eq "create-actionset"}</case>

+			<!-- The default will be set as the normal start, a.k.a. create-openaire-ranking-graph -->
			<default to="create-openaire-ranking-graph" />
		</switch>
	</decision>
@ -295,18 +296,11 @@
			<capture-output/>
		</shell>
-		<ok to="format-result-files" />
+		<ok to="format-json-files" />
		<error to="filename-getting-error" />
	</action>

-	<!-- Now we will run in parallel the formatting of ranking files for BiP! DB and openaire (json files) -->
-	<fork name="format-result-files">
-		<path start="format-bip-files"/>
-		<path start="format-json-files"/>
-	</fork>

	<!-- Format json files -->
	<!-- Two parts: a) format files b) make the file endings .json.gz -->
	<action name="format-json-files">
@ -345,139 +339,8 @@
			<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
		</spark>
<ok to="join-file-formatting" />
<error to="json-formatting-fail" />
</action>
<!-- This is the second line of parallel workflow execution where we create the BiP! DB files -->
<action name="format-bip-files">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<!-- This is the name of our job -->
<name>Format Ranking Results BiP! DB</name>
<!-- Script name goes here -->
<jar>format_ranking_results.py</jar>
<!-- spark configuration options: I've taken most of them from an example from dhp workflows / Master value stolen from sandro -->
<spark-opts>
--executor-memory=${sparkNormalExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkNormalDriverMemory}
--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<!-- Script arguments here -->
<arg>zenodo</arg>
<!-- Input files must be identified dynamically -->
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
<!-- Num partitions -->
<arg>${sparkShufflePartitions}</arg>
<!-- Type of data to be produced [bip (dois) / openaire (openaire-ids) ] -->
<arg>openaire</arg>
<!-- This needs to point to the file on the hdfs i think -->
<file>${wfAppPath}/format_ranking_results.py#format_ranking_results.py</file>
</spark>
<ok to="join-file-formatting" />
<error to="bip-formatting-fail" />
</action>
<!-- Finish formatting jobs -->
<join name="join-file-formatting" to="map-openaire-to-doi"/>
<!-- maps openaire ids to DOIs -->
<action name="map-openaire-to-doi">
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- Delete previously created doi synonym folder -->
<prepare>
<delete path="${synonymFolder}"/>
</prepare>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Openaire-DOI synonym collection</name>
<jar>map_openaire_ids_to_dois.py</jar>
<spark-opts>
--executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkHighDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<!-- Script arguments here -->
<arg>${openaireDataInput}/</arg>
<!-- number of partitions to be used on joins -->
<arg>${synonymFolder}</arg>
<file>${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py</file>
</spark>
<ok to="map-scores-to-dois" />
<error to="synonym-collection-fail" />
</action>
<!-- mapping openaire scores to DOIs -->
<action name="map-scores-to-dois">
<!-- This is required as a tag for spark jobs, regardless of programming language -->
<spark xmlns="uri:oozie:spark-action:0.2">
<!-- using configs from an example on openaire -->
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Mapping Openaire Scores to DOIs</name>
<jar>map_scores_to_dois.py</jar>
<spark-opts>
--executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkHighDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<!-- Script arguments here -->
<arg>${synonymFolder}</arg>
<!-- Number of partitions -->
<arg>${sparkShufflePartitions}</arg>
<!-- The remaining input are the ranking files fproduced for bip db-->
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']}</arg>
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']}</arg>
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']}</arg>
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']}</arg>
<arg>${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']}</arg>
<file>${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py</file>
</spark>
<ok to="project-impact-indicators" /> <ok to="project-impact-indicators" />
<error to="map-scores-fail" /> <error to="json-formatting-fail" />
</action> </action>
<action name="project-impact-indicators"> <action name="project-impact-indicators">
@ -594,18 +457,6 @@
		<message>Error formatting json files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
	</kill>
<kill name="bip-formatting-fail">
<message>Error formatting BIP files, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="synonym-collection-fail">
<message>Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="map-scores-fail">
<message>Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<kill name="actionset-delete-fail"> <kill name="actionset-delete-fail">
<message>Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>

View File

@ -32,7 +32,7 @@ select distinct * from (
    from SOURCE.result r
    join SOURCE.result_projects rp on rp.id=r.id
    join SOURCE.project p on p.id=rp.project
-   join openaire_prod_stats_monitor_ie_20231226b.irish_funders irf on irf.funder=p.funder
+   join TARGET.irish_funders irf on irf.funder=p.funder
    union all
    select r.*
    from SOURCE.result r

View File

@ -1,79 +1,3 @@
--drop database if exists TARGET cascade;
--create database if not exists TARGET;
--
--create view if not exists TARGET.category as select * from SOURCE.category;
--create view if not exists TARGET.concept as select * from SOURCE.concept;
--create view if not exists TARGET.context as select * from SOURCE.context;
--create view if not exists TARGET.country as select * from SOURCE.country;
--create view if not exists TARGET.countrygdp as select * from SOURCE.countrygdp;
--create view if not exists TARGET.creation_date as select * from SOURCE.creation_date;
--create view if not exists TARGET.funder as select * from SOURCE.funder;
--create view if not exists TARGET.fundref as select * from SOURCE.fundref;
--create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
--create view if not exists TARGET.rndgdpexpenditure as select * from SOURCE.rndgdpexpenditure;
--create view if not exists TARGET.doctoratestudents as select * from SOURCE.doctoratestudents;
--create view if not exists TARGET.totalresearchers as select * from SOURCE.totalresearchers;
--create view if not exists TARGET.totalresearchersft as select * from SOURCE.totalresearchersft;
--create view if not exists TARGET.hrrst as select * from SOURCE.hrrst;
--
--create table TARGET.result stored as parquet as
-- select distinct * from (
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
-- union all
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
-- union all
-- select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
-- 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
-- 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
-- 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
-- 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
-- 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
-- 'openorgs____::0ae431b820e4c33db8967fbb2b919150', --University of Helsinki
-- 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
-- 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
-- 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
-- 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
-- -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
-- 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
-- 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
-- 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
-- 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
-- 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
-- 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
-- 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
-- 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
-- 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
-- 'openorgs____::6445d7758d3a40c4d997953b6632a368', --National Institute of Informatics (NII)
-- 'openorgs____::b77c01aa15de3675da34277d48de2ec1', -- Valencia Catholic University Saint Vincent Martyr
-- 'openorgs____::7fe2f66cdc43983c6b24816bfe9cf6a0', -- Unviersity of Warsaw
-- 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly
-- 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete
-- 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus
-- 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras
-- 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
-- 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
-- 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
-- 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
-- 'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
-- 'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb', -- McMaster University
-- 'openorgs____::51c7fc556e46381734a25a6fbc3fd398', -- University of Modena and Reggio Emilia
-- 'openorgs____::235d7f9ad18ecd7e6dc62ea4990cb9db', -- Bilkent University
-- 'openorgs____::31f2fa9e05b49d4cf40a19c3fed8eb06', -- Saints Cyril and Methodius University of Skopje
-- 'openorgs____::db7686f30f22cbe73a4fde872ce812a6', -- University of Milan
-- 'openorgs____::b8b8ca674452579f3f593d9f5e557483', -- University College Cork
-- 'openorgs____::38d7097854736583dde879d12dacafca' -- Brown University
-- 'openorgs____::57784c9e047e826fefdb1ef816120d92', --Arts et Métiers ParisTech
-- 'openorgs____::2530baca8a15936ba2e3297f2bce2e7e', -- University of Cape Town
-- 'openorgs____::d11f981828c485cd23d93f7f24f24db1', -- Technological University Dublin
-- 'openorgs____::5e6bf8962665cdd040341171e5c631d8', -- Delft University of Technology
-- 'openorgs____::846cb428d3f52a445f7275561a7beb5d', -- University of Manitoba
-- 'openorgs____::eb391317ed0dc684aa81ac16265de041', -- Universitat Rovira i Virgili
-- 'openorgs____::66aa9fc2fceb271423dfabcc38752dc0', -- Lund University
-- 'openorgs____::3cff625a4370d51e08624cc586138b2f' -- IMT Atlantique
-- ) )) foo;
--
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;
create view if not exists TARGET.category as select * from SOURCE.category;
create view if not exists TARGET.concept as select * from SOURCE.concept;
create view if not exists TARGET.context as select * from SOURCE.context;

View File

@ -81,7 +81,17 @@ create table TARGET.result stored as parquet as
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
-'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
+'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
))) foo;
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -61,7 +61,17 @@ create table TARGET.result stored as parquet as
'openorgs____::8839b55dae0c84d56fd533f52d5d483a', -- Leibniz Institute of Ecological Urban and Regional Development
'openorgs____::526468206bca24c1c90da6a312295cf4', -- Cyprus University of Technology
'openorgs____::b5ca9d4340e26454e367e2908ef3872f', -- Alma Mater Studiorum University of Bologna
-'openorgs____::a6340e6ecf60f6bba163659df985b0f2' -- TU Dresden
+'openorgs____::a6340e6ecf60f6bba163659df985b0f2', -- TU Dresden
'openorgs____::64badd35233ba2cd4946368ef2f4cf57', -- University of Vienna
'openorgs____::7501d66d2297a963ebfb075c43fff88e', -- Royal Institute of Technology
'openorgs____::d5eb679abdd31f70fcd4c8ba711148bf', -- Sorbonne University
'openorgs____::b316f25380d106aac402f5ae8653910d', -- Centre for Research on Ecology and Forestry Applications
'openorgs____::45a2076eee3013e0e85625ce61bcd272', -- Institut d'Investigació Sanitària Illes Balears
'openorgs____::00b20b0a743a96169e6cf135e6e2bd7c', -- Universidad Publica De Navarra
'openorgs____::0f398605c2459294d125ff23473a97dc', -- Aalto University
'openorgs____::25b1fa62c7fd8e409d3a83c07e04b2d4', -- WHU-Otto Beisheim School of Management
'openorgs____::d6eec313417f11205db4e736a34c0db6', -- KEMPELENOV INSTITUT INTELIGENTNYCH TECHNOLOGII
'openorgs____::c2dfb90e797a2dc52f0084c549289d0c' -- National Research Institute for Agriculture, Food and Environment
))) foo;
--ANALYZE TABLE TARGET.result COMPUTE STATISTICS;

View File

@ -0,0 +1,18 @@
# Install the whole "dnet-hadoop" project.
# Delete this module's previous build-files in order to avoid any conflicts.
rm -rf target/ ||
# Go to the root directory of this project.
cd ../../
# Select the build profile.
DEFAULT_PROFILE='' # It's the empty profile.
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
# Install the project.
mvn clean install -U ${CHOSEN_MAVEN_PROFILE} -Dmaven.test.skip=true
# We skip tests for all modules, since they take a lot of time and some of them fail.
# Any test added to this module will be executed in the "runOozieWorkflow.sh" script.

View File

@ -0,0 +1,20 @@
# This script deploys and runs the oozie workflow on the cluster defined in the "~/.dhp/application.properties" file.
# Select the build profile.
DEFAULT_PROFILE='' # It's the empty profile.
NEWER_VERSIONS_PROFILE='-Pscala-2.12'
CHOSEN_MAVEN_PROFILE=${DEFAULT_PROFILE}
# Build and deploy this module.
mvn clean package -U ${CHOSEN_MAVEN_PROFILE} -Poozie-package,deploy,run \
-Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/stats
# Show the Oozie-job-ID.
echo -e "\n\nShowing the contents of \"extract-and-run-on-remote-host.log\":\n"
cat ./target/extract-and-run-on-remote-host.log
# Check oozie workflow status
# oozie job -oozie http://iis-cdh5-test-m3:11000/oozie -info <workflow-ID>
# Get the <job-ID> from the previous output and check the logs:
# yarn logs -applicationId application_<job-ID>
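
The commented commands above describe the manual follow-up after submission; a small helper along those lines could poll the workflow until it finishes. This is a sketch only, not a file from the repository: the workflow-ID pattern and the assumption that the ID can be grepped out of extract-and-run-on-remote-host.log are guesses, and the Oozie URL is simply the one quoted in the comment above.

#!/usr/bin/env bash
# Sketch: poll the submitted Oozie workflow until it leaves the RUNNING state.
# Assumes an ID of the form 0000123-123456789012345-oozie-oozi-W appears in the build log (an assumption).
OOZIE_URL="http://iis-cdh5-test-m3:11000/oozie"
JOB_ID=$(grep -oE '[0-9]+-[0-9]+-oozie-oozi-W' ./target/extract-and-run-on-remote-host.log | tail -n 1)

while true; do
    # "oozie job -info" prints a "Status : <STATE>" line; pick out the state token.
    STATUS=$(oozie job -oozie "${OOZIE_URL}" -info "${JOB_ID}" | grep -m1 '^Status' | awk '{print $3}')
    echo "$(date '+%H:%M:%S') ${JOB_ID} is ${STATUS:-UNKNOWN}"
    [[ "${STATUS}" == "RUNNING" ]] || break
    sleep 60
done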

View File

@ -1,8 +1,10 @@
set mapred.job.queue.name=analytics; /*EOS*/
--------------------------------------------------------------
--------------------------------------------------------------
-- Stats database creation
--------------------------------------------------------------
--------------------------------------------------------------
-DROP database IF EXISTS ${stats_db_name} CASCADE;
+DROP database IF EXISTS ${stats_db_name} CASCADE; /*EOS*/
-CREATE database ${stats_db_name};
+CREATE database ${stats_db_name}; /*EOS*/
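
The /*EOS*/ markers being appended throughout these scripts read like per-statement terminators for whatever driver executes them. Purely as an illustration, and not code taken from this repository, a runner could split a step script on that marker and submit one statement at a time; the script name, database name and JDBC URL below are placeholders.

#!/usr/bin/env bash
# Illustrative sketch only: split a stats step script on /*EOS*/ and run each statement via beeline.
SCRIPT="step1.sql"
STATS_DB="stats_db"
JDBC_URL="jdbc:hive2://hive-server:10000/default"

# Resolve the ${stats_db_name} template variable.
sed "s/\${stats_db_name}/${STATS_DB}/g" "${SCRIPT}" > /tmp/resolved.sql

# Split on the marker (GNU awk accepts a regex record separator) into one file per statement.
awk -v RS='/\\*EOS\\*/' 'NF { printf "%s\n", $0 > (sprintf("/tmp/stmt_%04d.sql", NR)) }' /tmp/resolved.sql

# Run the statements one by one.
for f in /tmp/stmt_*.sql; do
    beeline -u "${JDBC_URL}" -f "${f}"
done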

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Tables/views from external tables/views (Fundref, Country, CountyGDP, roarmap, rndexpediture)
@ -5,27 +7,27 @@
------------------------------------------------------------------------------------------------
CREATE OR REPLACE VIEW ${stats_db_name}.fundref AS
SELECT *
-FROM ${external_stats_db_name}.fundref;
+FROM ${external_stats_db_name}.fundref; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.country AS
SELECT *
-FROM ${external_stats_db_name}.country;
+FROM ${external_stats_db_name}.country; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.countrygdp AS
SELECT *
-FROM ${external_stats_db_name}.countrygdp;
+FROM ${external_stats_db_name}.countrygdp; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.roarmap AS
SELECT *
-FROM ${external_stats_db_name}.roarmap;
+FROM ${external_stats_db_name}.roarmap; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.rndexpediture AS
SELECT *
-FROM ${external_stats_db_name}.rndexpediture;
+FROM ${external_stats_db_name}.rndexpediture; /*EOS*/
CREATE OR REPLACE VIEW ${stats_db_name}.licenses_normalized AS
SELECT *
-FROM ${external_stats_db_name}.licenses_normalized;
+FROM ${external_stats_db_name}.licenses_normalized; /*EOS*/
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
@ -33,23 +35,23 @@ FROM ${external_stats_db_name}.licenses_normalized;
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
create or replace view ${stats_db_name}.usage_stats as
-select * from openaire_prod_usage_stats.usage_stats;
+select * from openaire_prod_usage_stats.usage_stats; /*EOS*/
create or replace view ${stats_db_name}.downloads_stats as
-select * from openaire_prod_usage_stats.downloads_stats;
+select * from openaire_prod_usage_stats.downloads_stats; /*EOS*/
create or replace view ${stats_db_name}.pageviews_stats as
-select * from openaire_prod_usage_stats.pageviews_stats;
+select * from openaire_prod_usage_stats.pageviews_stats; /*EOS*/
create or replace view ${stats_db_name}.views_stats as
-select * from openaire_prod_usage_stats.views_stats;
+select * from openaire_prod_usage_stats.views_stats; /*EOS*/
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-- Creation date of the database
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
-DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge;
+DROP TABLE IF EXISTS ${stats_db_name}.creation_date purge; /*EOS*/
create table ${stats_db_name}.creation_date STORED AS PARQUET as
-select date_format(current_date(), 'dd-MM-yyyy') as date;
+select date_format(current_date(), 'dd-MM-yyyy') as date; /*EOS*/

View File

@ -1,110 +1,11 @@
set mapred.job.queue.name=analytics; /*EOS*/
----------------------------------------------------------------
----------------------------------------------------------------
-- Post processing - Updates on main tables
----------------------------------------------------------------
----------------------------------------------------------------
--Datasource temporary table updates
UPDATE ${stats_db_name}.datasource_tmp
SET harvested='true'
WHERE datasource_tmp.id IN (SELECT DISTINCT d.id
FROM ${stats_db_name}.datasource_tmp d,
${stats_db_name}.result_datasources rd
WHERE d.id = rd.datasource);
-- Project temporary table update and final project table creation with final updates that can not be applied to ORC tables
UPDATE ${stats_db_name}.project_tmp
SET haspubs='yes'
WHERE project_tmp.id IN (SELECT pr.id
FROM ${stats_db_name}.project_results pr,
${stats_db_name}.result r
WHERE pr.result = r.id
AND r.type = 'publication');
DROP TABLE IF EXISTS ${stats_db_name}.stored purge;
CREATE TABLE ${stats_db_name}.project stored as parquet as
SELECT p.id,
p.acronym,
p.title,
p.funder,
p.funding_lvl0,
p.funding_lvl1,
p.funding_lvl2,
p.ec39,
p.type,
p.startdate,
p.enddate,
p.start_year,
p.end_year,
p.duration,
CASE WHEN prr1.id IS NULL THEN 'no' ELSE 'yes' END AS haspubs,
CASE WHEN prr1.id IS NULL THEN 0 ELSE prr1.np END AS numpubs,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.daysForlastPub END AS daysforlastpub,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
p.callidentifier,
p.code,
p.totalcost,
p.fundedamount,
p.currency
FROM ${stats_db_name}.project_tmp p
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
FROM ${stats_db_name}.project_results pr
INNER JOIN ${stats_db_name}.result r ON pr.result = r.id
WHERE r.type = 'publication'
GROUP BY pr.id) AS prr1 on prr1.id = p.id
LEFT JOIN (SELECT pp.id,
max(datediff(to_date(r.date), to_date(pp.enddate))) AS daysForlastPub,
count(distinct r.id) AS dp
FROM ${stats_db_name}.project_tmp pp,
${stats_db_name}.project_results pr,
${stats_db_name}.result r
WHERE pp.id = pr.id
AND pr.result = r.id
AND r.type = 'publication'
AND datediff(to_date(r.date), to_date(pp.enddate)) > 0
GROUP BY pp.id) AS prr2
ON prr2.id = p.id;
UPDATE ${stats_db_name}.publication_tmp
SET delayed = 'yes'
WHERE publication_tmp.id IN (SELECT distinct r.id
FROM ${stats_db_name}.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
UPDATE ${stats_db_name}.dataset_tmp
SET delayed = 'yes'
WHERE dataset_tmp.id IN (SELECT distinct r.id
FROM ${stats_db_name}.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
UPDATE ${stats_db_name}.software_tmp
SET delayed = 'yes'
WHERE software_tmp.id IN (SELECT distinct r.id
FROM ${stats_db_name}.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
UPDATE ${stats_db_name}.otherresearchproduct_tmp
SET delayed = 'yes'
WHERE otherresearchproduct_tmp.id IN (SELECT distinct r.id
FROM ${stats_db_name}.result r,
${stats_db_name}.project_results pr,
${stats_db_name}.project_tmp p
WHERE r.id = pr.result
AND pr.id = p.id
AND to_date(r.date) - to_date(p.enddate) > 0);
CREATE OR REPLACE VIEW ${stats_db_name}.project_results_publication AS
SELECT result_projects.id AS result,
result_projects.project AS project_results,
@ -116,4 +17,4 @@ FROM ${stats_db_name}.result_projects,
${stats_db_name}.project
WHERE result_projects.id = result.id
AND result.type = 'publication'
-AND project.id = result_projects.project;
+AND project.id = result_projects.project; /*EOS*/

View File

@ -1,42 +1,4 @@
-------------------------------------------------------------------------------------------------------
+set mapred.job.queue.name=analytics; /*EOS*/
-- Creating parquet tables from the updated temporary tables and removing unnecessary temporary tables
------------------------------------------------------------------------------------------------------
DROP TABLE IF EXISTS ${stats_db_name}.datasource purge;
CREATE TABLE ${stats_db_name}.datasource stored AS parquet AS
SELECT *
FROM ${stats_db_name}.datasource_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.publication purge;
CREATE TABLE ${stats_db_name}.publication stored AS parquet AS
SELECT *
FROM ${stats_db_name}.publication_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.dataset purge;
CREATE TABLE ${stats_db_name}.dataset stored AS parquet AS
SELECT *
FROM ${stats_db_name}.dataset_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.software purge;
CREATE TABLE ${stats_db_name}.software stored AS parquet AS
SELECT *
FROM ${stats_db_name}.software_tmp;
DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct purge;
CREATE TABLE ${stats_db_name}.otherresearchproduct stored AS parquet AS
SELECT *
FROM ${stats_db_name}.otherresearchproduct_tmp;
DROP TABLE ${stats_db_name}.project_tmp;
DROP TABLE ${stats_db_name}.datasource_tmp;
DROP TABLE ${stats_db_name}.publication_tmp;
DROP TABLE ${stats_db_name}.dataset_tmp;
DROP TABLE ${stats_db_name}.software_tmp;
DROP TABLE ${stats_db_name}.otherresearchproduct_tmp;
----------------------------------------------
-- Re-creating views from final parquet tables
@ -54,4 +16,4 @@ SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.dataset
UNION ALL
SELECT *, bestlicence AS access_mode
-FROM ${stats_db_name}.otherresearchproduct;
+FROM ${stats_db_name}.otherresearchproduct; /*EOS*/

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------
------------------------------------------------------
-- Additional relations
@ -5,10 +7,10 @@
-- Sources related tables/views
------------------------------------------------------
------------------------------------------------------
-DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge;
+DROP TABLE IF EXISTS ${stats_db_name}.publication_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_sources STORED AS PARQUET as
-SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.publication p lateral view explode(p.collectedfrom.key) c as datasource) p
@ -16,12 +18,12 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
+WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge;
+DROP TABLE IF EXISTS ${stats_db_name}.dataset_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_sources STORED AS PARQUET as
-SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.dataset p lateral view explode(p.collectedfrom.key) c as datasource) p
@ -29,12 +31,12 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
+WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge;
+DROP TABLE IF EXISTS ${stats_db_name}.software_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_sources STORED AS PARQUET as
-SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.software p lateral view explode(p.collectedfrom.key) c as datasource) p
@ -42,12 +44,12 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
+WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge;
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_sources STORED AS PARQUET as
-SELECT p.id, case when d.id is null then 'other' else p.datasource end as datasource
+SELECT /*+ COALESCE(100) */ p.id, case when d.id is null then 'other' else p.datasource end as datasource
FROM (
SELECT substr(p.id, 4) as id, substr(datasource, 4) as datasource
from ${openaire_db_name}.otherresearchproduct p lateral view explode(p.collectedfrom.key) c as datasource) p
@ -55,7 +57,7 @@ LEFT OUTER JOIN
(
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id;
+WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on p.datasource = d.id; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_sources AS
SELECT * FROM ${stats_db_name}.publication_sources
@ -64,24 +66,24 @@ SELECT * FROM ${stats_db_name}.dataset_sources
UNION ALL
SELECT * FROM ${stats_db_name}.software_sources
UNION ALL
-SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
+SELECT * FROM ${stats_db_name}.otherresearchproduct_sources; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge;
+DROP TABLE IF EXISTS ${stats_db_name}.result_orcid purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
-select distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
+select /*+ COALESCE(100) */ distinct res.id, upper(regexp_replace(res.orcid, 'http://orcid.org/' ,'')) as orcid
from (
SELECT substr(res.id, 4) as id, auth_pid.value as orcid
FROM ${openaire_db_name}.result res
LATERAL VIEW explode(author) a as auth
LATERAL VIEW explode(auth.pid) ap as auth_pid
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
-WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res;
+WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.result_result purge;
+DROP TABLE IF EXISTS ${stats_db_name}.result_result purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as
-select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
+select /*+ COALESCE(100) */ substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id
join ${openaire_db_name}.result r2 on r2.id=rel.target
@ -91,12 +93,12 @@ where reltype='resultResult'
and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE
and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other'
-and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
+and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge;
+DROP TABLE IF EXISTS ${stats_db_name}.result_citations_oc purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as
-select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
+select /*+ COALESCE(100) */ substr(target, 4) as id, count(distinct substr(source, 4)) as citations
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id
join ${openaire_db_name}.result r2 on r2.id=rel.target
@ -108,12 +110,12 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
-group by substr(target, 4);
+group by substr(target, 4); /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge;
+DROP TABLE IF EXISTS ${stats_db_name}.result_references_oc purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as
-select substr(source, 4) as id, count(distinct substr(target, 4)) as references
+select /*+ COALESCE(100) */ substr(source, 4) as id, count(distinct substr(target, 4)) as references
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id
join ${openaire_db_name}.result r2 on r2.id=rel.target
@ -125,4 +127,4 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
-group by substr(source, 4);
+group by substr(source, 4); /*EOS*/

View File

@ -1,4 +1,5 @@
-set mapred.job.queue.name=analytics;
+set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------
------------------------------------------------------
-- Additional relations
@ -6,33 +7,33 @@ set mapred.job.queue.name=analytics;
-- Licences related tables/views
------------------------------------------------------
------------------------------------------------------
-DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge;
+DROP TABLE IF EXISTS ${stats_db_name}.publication_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_licenses STORED AS PARQUET AS
-SELECT substr(p.id, 4) as id, licenses.value as type
+SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.publication p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge;
+DROP TABLE IF EXISTS ${stats_db_name}.dataset_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_licenses STORED AS PARQUET AS
-SELECT substr(p.id, 4) as id, licenses.value as type
+SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.dataset p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge;
+DROP TABLE IF EXISTS ${stats_db_name}.software_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_licenses STORED AS PARQUET AS
-SELECT substr(p.id, 4) as id, licenses.value as type
+SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.software p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge;
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_licenses purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_licenses STORED AS PARQUET AS
-SELECT substr(p.id, 4) as id, licenses.value as type
+SELECT /*+ COALESCE(100) */ substr(p.id, 4) as id, licenses.value as type
from ${openaire_db_name}.otherresearchproduct p LATERAL VIEW explode(p.instance.license) instances as licenses
-where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE;
+where licenses.value is not null and licenses.value != '' and p.datainfo.deletedbyinference=false and p.datainfo.invisible = FALSE; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_licenses AS
SELECT * FROM ${stats_db_name}.publication_licenses
@ -41,29 +42,29 @@ SELECT * FROM ${stats_db_name}.dataset_licenses
UNION ALL
SELECT * FROM ${stats_db_name}.software_licenses
UNION ALL
-SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
+SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge;
+DROP TABLE IF EXISTS ${stats_db_name}.organization_pids purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_pids STORED AS PARQUET AS
-select substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
+select /*+ COALESCE(100) */ substr(o.id, 4) as id, ppid.qualifier.classname as type, ppid.value as pid
-from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid;
+from ${openaire_db_name}.organization o lateral view explode(o.pid) pids as ppid; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge;
+DROP TABLE IF EXISTS ${stats_db_name}.organization_sources purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.organization_sources STORED AS PARQUET as
-SELECT o.id, case when d.id is null then 'other' else o.datasource end as datasource
+SELECT /*+ COALESCE(100) */ o.id, case when d.id is null then 'other' else o.datasource end as datasource
FROM (
SELECT substr(o.id, 4) as id, substr(instances.instance.key, 4) as datasource
from ${openaire_db_name}.organization o lateral view explode(o.collectedfrom) instances as instance) o
LEFT OUTER JOIN (
SELECT substr(d.id, 4) id
from ${openaire_db_name}.datasource d
-WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id;
+WHERE d.datainfo.deletedbyinference=false and d.datainfo.invisible = FALSE) d on o.datasource = d.id; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge;
+DROP TABLE IF EXISTS ${stats_db_name}.result_accessroute purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_accessroute STORED AS PARQUET as
-select distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
+select /*+ COALESCE(100) */ distinct substr(id,4) as id, accessroute from ${openaire_db_name}.result
lateral view explode (instance.accessright.openaccessroute) openaccessroute as accessroute
-WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE;
+WHERE datainfo.deletedbyinference=false and datainfo.invisible = FALSE; /*EOS*/

View File

@ -1,4 +1,4 @@
-set mapred.job.queue.name=analytics;
+set mapred.job.queue.name=analytics; /*EOS*/
------------------------------------------------------
------------------------------------------------------
@ -8,7 +8,7 @@ set mapred.job.queue.name=analytics;
------------------------------------------------------
------------------------------------------------------
-DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge;
+DROP TABLE IF EXISTS ${stats_db_name}.publication_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.publication_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -18,15 +18,15 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.publication r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
-select distinct *
+select /*+ COALESCE(100) */ distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
-where peer_reviewed.id is null) pr;
+where peer_reviewed.id is null) pr; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge;
+DROP TABLE IF EXISTS ${stats_db_name}.dataset_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.dataset_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -36,15 +36,15 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.dataset r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
-select distinct *
+select /*+ COALESCE(100) */ distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
-where peer_reviewed.id is null) pr;
+where peer_reviewed.id is null) pr; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge;
+DROP TABLE IF EXISTS ${stats_db_name}.software_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.software_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -54,15 +54,15 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.software r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
-select distinct *
+select /*+ COALESCE(100) */ distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
-where peer_reviewed.id is null) pr;
+where peer_reviewed.id is null) pr; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge;
+DROP TABLE IF EXISTS ${stats_db_name}.otherresearchproduct_refereed purge; /*EOS*/
CREATE TABLE IF NOT EXISTS ${stats_db_name}.otherresearchproduct_refereed STORED AS PARQUET as
with peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
@ -72,13 +72,13 @@ non_peer_reviewed as (
select distinct substr(r.id, 4) as id, inst.refereed.classname as refereed
from ${openaire_db_name}.otherresearchproduct r lateral view explode(r.instance) instances as inst
where r.datainfo.deletedbyinference=false and r.datainfo.invisible = FALSE and inst.refereed.classname='nonPeerReviewed')
-select distinct *
+select /*+ COALESCE(100) */ distinct *
from (
select peer_reviewed.* from peer_reviewed
union all
select non_peer_reviewed.* from non_peer_reviewed
left join peer_reviewed on peer_reviewed.id=non_peer_reviewed.id
-where peer_reviewed.id is null) pr;
+where peer_reviewed.id is null) pr; /*EOS*/
CREATE VIEW IF NOT EXISTS ${stats_db_name}.result_refereed as
select * from ${stats_db_name}.publication_refereed
@ -87,23 +87,23 @@ select * from ${stats_db_name}.dataset_refereed
union all
select * from ${stats_db_name}.software_refereed
union all
-select * from ${stats_db_name}.otherresearchproduct_refereed;
+select * from ${stats_db_name}.otherresearchproduct_refereed; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge;
+DROP TABLE IF EXISTS ${stats_db_name}.indi_impact_measures purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_impact_measures STORED AS PARQUET as
-select substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
+select /*+ COALESCE(100) */ substr(id, 4) as id, measures_ids.id impactmetric, cast(measures_ids.unit.value[0] as double) score,
cast(measures_ids.unit.value[0] as decimal(6,3)) score_dec, measures_ids.unit.value[1] impact_class
from ${openaire_db_name}.result lateral view explode(measures) measures as measures_ids
-where measures_ids.id!='views' and measures_ids.id!='downloads';
+where measures_ids.id!='views' and measures_ids.id!='downloads'; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge;
+DROP TABLE IF EXISTS ${stats_db_name}.result_apc_affiliations purge; /*EOS*/
create table if not exists ${stats_db_name}.result_apc_affiliations STORED AS PARQUET as
-select distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
+select /*+ COALESCE(100) */ distinct substr(rel.target,4) id, substr(rel.source,4) organization, o.legalname.value name,
cast(rel.properties[0].value as double) apc_amount,
rel.properties[1].value apc_currency
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.organization o on o.id=rel.source
join ${openaire_db_name}.result r on r.id=rel.target
-where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;
+where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; /*EOS*/

View File

@ -1,27 +1,27 @@
-set mapred.job.queue.name=analytics;
+set mapred.job.queue.name=analytics; /*EOS*/
-------------------------------------------
--- Extra tables, mostly used by indicators
-DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge;
+DROP TABLE IF EXISTS ${stats_db_name}.result_projectcount purge; /*EOS*/
create table if not exists ${stats_db_name}.result_projectcount STORED AS PARQUET as
-select r.id, count(distinct p.id) as count
+select /*+ COALESCE(100) */ r.id, count(distinct p.id) as count
from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project
-group by r.id;
+group by r.id; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge;
+DROP TABLE IF EXISTS ${stats_db_name}.result_fundercount purge; /*EOS*/
create table if not exists ${stats_db_name}.result_fundercount STORED AS PARQUET as
-select r.id, count(distinct p.funder) as count
+select /*+ COALESCE(100) */ r.id, count(distinct p.funder) as count
from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_projects rp on rp.id=r.id
left outer join ${stats_db_name}.project p on p.id=rp.project
-group by r.id;
+group by r.id; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge;
+DROP TABLE IF EXISTS ${stats_db_name}.project_resultcount purge; /*EOS*/
create table if not exists ${stats_db_name}.project_resultcount STORED AS PARQUET as
with rcount as (
@ -30,39 +30,39 @@ with rcount as (
left outer join ${stats_db_name}.result_projects rp on rp.project=p.id
left outer join ${stats_db_name}.result r on r.id=rp.id
group by r.type, p.id )
-select rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications,
+select /*+ COALESCE(100) */ rcount.pid, sum(case when rcount.type='publication' then rcount.count else 0 end) as publications,
sum(case when rcount.type='dataset' then rcount.count else 0 end) as datasets,
sum(case when rcount.type='software' then rcount.count else 0 end) as software,
sum(case when rcount.type='other' then rcount.count else 0 end) as other
from rcount
-group by rcount.pid;
+group by rcount.pid; /*EOS*/
-create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture;
+create or replace view ${stats_db_name}.rndexpenditure as select * from stats_ext.rndexpediture; /*EOS*/
-create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure;
+create or replace view ${stats_db_name}.rndgdpexpenditure as select * from stats_ext.rndgdpexpenditure; /*EOS*/
-create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents;
+create or replace view ${stats_db_name}.doctoratestudents as select * from stats_ext.doctoratestudents; /*EOS*/
-create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers;
+create or replace view ${stats_db_name}.totalresearchers as select * from stats_ext.totalresearchers; /*EOS*/
-create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft;
+create or replace view ${stats_db_name}.totalresearchersft as select * from stats_ext.totalresearchersft; /*EOS*/
-create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst;
+create or replace view ${stats_db_name}.hrrst as select * from stats_ext.hrrst; /*EOS*/
-create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates;
+create or replace view ${stats_db_name}.graduatedoctorates as select * from stats_ext.graduatedoctorates; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge;
+DROP TABLE IF EXISTS ${stats_db_name}.result_instance purge; /*EOS*/
create table if not exists ${stats_db_name}.result_instance stored as parquet as
-select distinct r.*
+select /*+ COALESCE(100) */ distinct r.*
from (
select substr(r.id, 4) as id, inst.accessright.classname as accessright, inst.accessright.openaccessroute as accessright_uw, substr(inst.collectedfrom.key, 4) as collectedfrom,
substr(inst.hostedby.key, 4) as hostedby, inst.dateofacceptance.value as dateofacceptance, inst.license.value as license, p.qualifier.classname as pidtype, p.value as pid
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst lateral view outer explode(inst.pid) pids as p) r
-join ${stats_db_name}.result res on res.id=r.id;
+join ${stats_db_name}.result res on res.id=r.id; /*EOS*/
-DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge;
+DROP TABLE IF EXISTS ${stats_db_name}.result_apc purge; /*EOS*/
create table if not exists ${stats_db_name}.result_apc STORED AS PARQUET as
-select distinct r.id, r.amount, r.currency
+select /*+ COALESCE(100) */ distinct r.id, r.amount, r.currency
from (
select substr(r.id, 4) as id, cast(inst.processingchargeamount.value as float) as amount, inst.processingchargecurrency.value as currency
from ${openaire_db_name}.result r lateral view explode(r.instance) instances as inst) r
join ${stats_db_name}.result res on res.id=r.id
-where r.amount is not null;
+where r.amount is not null; /*EOS*/
-create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset;
+create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; /*EOS*/

View File

@ -1,7 +1,7 @@
-- Sprint 1 ----
drop table if exists ${stats_db_name}.indi_pub_green_oa purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_green_oa stored as parquet as
-select distinct p.id, coalesce(green_oa, 0) as green_oa
+select /*+ COALESCE(100) */ distinct p.id, coalesce(green_oa, 0) as green_oa
from ${stats_db_name}.publication p
left outer join (
select p.id, 1 as green_oa
@ -12,7 +12,7 @@ left outer join (
drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_grey_lit purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as create table if not exists ${stats_db_name}.indi_pub_grey_lit stored as parquet as
select distinct p.id, coalesce(grey_lit, 0) as grey_lit select /*+ COALESCE(100) */ distinct p.id, coalesce(grey_lit, 0) as grey_lit
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select p.id, 1 as grey_lit select p.id, 1 as grey_lit
@ -23,7 +23,7 @@ left outer join (
drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_doi_from_crossref purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as create table if not exists ${stats_db_name}.indi_pub_doi_from_crossref stored as parquet as
select distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref select /*+ COALESCE(100) */ distinct p.id, coalesce(doi_from_crossref, 0) as doi_from_crossref
from ${stats_db_name}.publication p from ${stats_db_name}.publication p
left outer join ( left outer join (
select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri select ri.id, 1 as doi_from_crossref from ${stats_db_name}.result_instance ri
@ -33,7 +33,7 @@ left outer join (
-- Sprint 2 ---- -- Sprint 2 ----
drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_has_cc_licence purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as create table if not exists ${stats_db_name}.indi_result_has_cc_licence stored as parquet as
select distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license select /*+ COALESCE(100) */ distinct r.id, (case when lic='' or lic is null then 0 else 1 end) as has_cc_license
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ( left outer join (
select r.id, license.type as lic from ${stats_db_name}.result r select r.id, license.type as lic from ${stats_db_name}.result r
@ -42,7 +42,7 @@ left outer join (
drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_has_cc_licence_url purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as create table if not exists ${stats_db_name}.indi_result_has_cc_licence_url stored as parquet as
select distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url select /*+ COALESCE(100) */ distinct r.id, case when lic_host='' or lic_host is null then 0 else 1 end as has_cc_license_url
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ( left outer join (
select r.id, lower(parse_url(license.type, "HOST")) as lic_host select r.id, lower(parse_url(license.type, "HOST")) as lic_host
@ -52,12 +52,12 @@ left outer join (
drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_has_abstract purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as create table if not exists ${stats_db_name}.indi_pub_has_abstract stored as parquet as
select distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract select /*+ COALESCE(100) */ distinct publication.id, cast(coalesce(abstract, true) as int) has_abstract
from ${stats_db_name}.publication; /*EOS*/ from ${stats_db_name}.publication; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_with_orcid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as create table if not exists ${stats_db_name}.indi_result_with_orcid stored as parquet as
select distinct r.id, coalesce(has_orcid, 0) as has_orcid select /*+ COALESCE(100) */ distinct r.id, coalesce(has_orcid, 0) as has_orcid
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ( left outer join (
select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/ select id, 1 as has_orcid from ${stats_db_name}.result_orcid) tmp on r.id= tmp.id; /*EOS*/
@@ -66,7 +66,7 @@ left outer join (
---- Sprint 3 ----
drop table if exists ${stats_db_name}.indi_funded_result_with_fundref purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_funded_result_with_fundref stored as parquet as
-select distinct r.result as id, coalesce(fundref, 0) as fundref
+select /*+ COALESCE(100) */ distinct r.result as id, coalesce(fundref, 0) as fundref
from ${stats_db_name}.project_results r
left outer join (
select distinct result, 1 as fundref from ${stats_db_name}.project_results where provenance='Harvested') tmp on r.result= tmp.result; /*EOS*/
@@ -77,7 +77,7 @@ create table if not exists ${stats_db_name}.indi_result_org_collab stored as par
SELECT ro.organization organization, ro.id, o.name
from ${stats_db_name}.result_organization ro
join ${stats_db_name}.organization o on o.id=ro.organization where o.name is not null)
-select o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
+select /*+ COALESCE(100) */ o1.organization org1, o1.name org1name1, o2.organization org2, o2.name org2name2, count(o1.id) as collaborations
from tmp as o1
join tmp as o2 where o1.id=o2.id and o1.organization!=o2.organization and o1.name!=o2.name
group by o1.organization, o2.organization, o1.name, o2.name; /*EOS*/
@@ -89,7 +89,7 @@ create table if not exists ${stats_db_name}.indi_result_org_country_collab store
from ${stats_db_name}.result_organization ro
join ${stats_db_name}.organization o on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null)
-select o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
+select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name1, o2.country country2, count(o1.id) as collaborations
from tmp as o1 join tmp as o2 on o1.id=o2.id
where o1.id=o2.id and o1.country!=o2.country
group by o1.organization, o1.id, o1.name, o2.country; /*EOS*/
@@ -100,7 +100,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org stored as pa
select o.id organization, o.name, ro.project as project
from ${stats_db_name}.organization o
join ${stats_db_name}.organization_projects ro on o.id=ro.id where o.name is not null)
-select o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
+select /*+ COALESCE(100) */ o1.organization org1,o1.name orgname1, o2.organization org2, o2.name orgname2, count(distinct o1.project) as collaborations
from tmp as o1
join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.name<>o2.name
@@ -112,7 +112,7 @@ create table if not exists ${stats_db_name}.indi_project_collab_org_country stor
select o.id organization, o.name, o.country , ro.project as project
from ${stats_db_name}.organization o
join ${stats_db_name}.organization_projects ro on o.id=ro.id and o.country <> 'UNKNOWN' and o.name is not null)
-select o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
+select /*+ COALESCE(100) */ o1.organization org1,o1.name org1name, o2.country country2, count(distinct o1.project) as collaborations
from tmp as o1
join tmp as o2 on o1.project=o2.project
where o1.organization<>o2.organization and o1.country<>o2.country
@@ -124,7 +124,7 @@ create table if not exists ${stats_db_name}.indi_funder_country_collab stored as
join ${stats_db_name}.organization o on o.id=op.id
join ${stats_db_name}.project p on p.id=op.project
where country <> 'UNKNOWN')
-select f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
+select /*+ COALESCE(100) */ f1.funder, f1.country as country1, f2.country as country2, count(distinct f1.project) as collaborations
from tmp as f1
join tmp as f2 on f1.project=f2.project
where f1.country<>f2.country
@@ -136,7 +136,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
select distinct country, ro.id as result from ${stats_db_name}.organization o
join ${stats_db_name}.result_organization ro on o.id=ro.organization
where country <> 'UNKNOWN' and o.name is not null)
-select o1.country country1, o2.country country2, count(o1.result) as collaborations
+select /*+ COALESCE(100) */ o1.country country1, o2.country country2, count(o1.result) as collaborations
from tmp as o1
join tmp as o2 on o1.result=o2.result
where o1.country<>o2.country
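[Editor's note: the *_collab indicators above all share one shape: a temporary set of (entity, grouping key) rows is joined with itself on the shared key (result, project, ...) and pairs with different organizations or countries are counted. A small, self-contained sketch of that pair-counting self-join follows; it is hypothetical and not part of this changeset, and the table name org_results and its literal rows are invented.]

-- Hypothetical illustration (not part of this diff) of the self-join used by the *_collab tables.
with org_results as (
    select 'orgA' as organization, 'r1' as id union all
    select 'orgB' as organization, 'r1' as id union all
    select 'orgC' as organization, 'r2' as id)
select o1.organization org1, o2.organization org2, count(o1.id) as collaborations
from org_results o1
join org_results o2 on o1.id = o2.id
where o1.organization <> o2.organization
group by o1.organization, o2.organization;
-- yields (orgA, orgB, 1) and (orgB, orgA, 1): each unordered pair is counted once per direction.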
@@ -146,7 +146,7 @@ create table if not exists ${stats_db_name}.indi_result_country_collab stored as
---- Sprint 4 ----
drop table if exists ${stats_db_name}.indi_pub_diamond purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet as
-select distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
+select /*+ COALESCE(100) */ distinct pd.id, coalesce(in_diamond_journal, 0) as in_diamond_journal
from ${stats_db_name}.publication_datasources pd
left outer join (
select pd.id, 1 as in_diamond_journal
@@ -157,7 +157,7 @@ create table if not exists ${stats_db_name}.indi_pub_diamond stored as parquet a
drop table if exists ${stats_db_name}.indi_pub_in_transformative purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as parquet as
-select distinct pd.id, coalesce(is_transformative, 0) as is_transformative
+select /*+ COALESCE(100) */ distinct pd.id, coalesce(is_transformative, 0) as is_transformative
from ${stats_db_name}.publication pd
left outer join (
select pd.id, 1 as is_transformative
@@ -168,7 +168,7 @@ create table if not exists ${stats_db_name}.indi_pub_in_transformative stored as
drop table if exists ${stats_db_name}.indi_pub_closed_other_open purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as parquet as
-select distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
+select /*+ COALESCE(100) */ distinct ri.id, coalesce(pub_closed_other_open, 0) as pub_closed_other_open
from ${stats_db_name}.result_instance ri
left outer join (
select ri.id, 1 as pub_closed_other_open
@@ -182,14 +182,14 @@ create table if not exists ${stats_db_name}.indi_pub_closed_other_open stored as
---- Sprint 5 ----
drop table if exists ${stats_db_name}.indi_result_no_of_copies purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_no_of_copies stored as parquet as
-select id, count(id) as number_of_copies
+select /*+ COALESCE(100) */ id, count(id) as number_of_copies
from ${stats_db_name}.result_instance
group by id; /*EOS*/
---- Sprint 6 ----
drop table if exists ${stats_db_name}.indi_pub_downloads purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet as
-SELECT result_id, sum(downloads) no_downloads
+SELECT /*+ COALESCE(100) */ result_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats
join ${stats_db_name}.publication on result_id=id
where downloads>0
@@ -197,7 +197,7 @@ create table if not exists ${stats_db_name}.indi_pub_downloads stored as parquet
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored as parquet as
-SELECT result_id, repository_id, sum(downloads) no_downloads
+SELECT /*+ COALESCE(100) */ result_id, repository_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats
join ${stats_db_name}.publication on result_id=id
where downloads>0
@@ -205,14 +205,14 @@ create table if not exists ${stats_db_name}.indi_pub_downloads_datasource stored
drop table if exists ${stats_db_name}.indi_pub_downloads_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_year stored as parquet as
-SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
+SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats us
join ${stats_db_name}.publication on result_id=id where downloads>0
GROUP BY result_id, substring(us.`date`, 1,4); /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_downloads_datasource_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_downloads_datasource_year stored as parquet as
-SELECT result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
+SELECT /*+ COALESCE(100) */ result_id, cast(substring(us.`date`, 1,4) as int) as `year`, repository_id, sum(downloads) no_downloads
from openaire_prod_usage_stats.usage_stats us
join ${stats_db_name}.publication on result_id=id
where downloads>0
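[Editor's note: the *_downloads_year tables derive the year by taking the first four characters of the usage-stats date string and casting them to int. A hypothetical one-liner showing the extraction, not part of this changeset; the literal '2023-05' is invented and the real format of usage_stats.`date` is whatever the usage-stats database stores.]

-- Hypothetical illustration (not part of this diff): year extraction as used above.
select cast(substring('2023-05', 1, 4) as int) as `year`;  -- returns 2023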
@@ -241,7 +241,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
UNION ALL
select id, issn_online as issn from ${stats_db_name}.datasource d left semi join gold_oa on gold_oa.issn=d.issn_online) foo
)
-SELECT DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
+SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_gold, 0) as is_gold
FROM ${stats_db_name}.publication pd
left outer join (
select pd.id, 1 as is_gold
@@ -272,7 +272,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
FROM ${stats_db_name}.datasource
WHERE issn_online IS NOT NULL ) as issn
WHERE LENGTH(issn) > 7)
-SELECT DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
+SELECT /*+ COALESCE(100) */ DISTINCT pd.id, coalesce(is_hybrid_oa, 0) as is_hybrid_oa
FROM ${stats_db_name}.publication_datasources pd
LEFT OUTER JOIN (
SELECT pd.id, 1 as is_hybrid_oa from ${stats_db_name}.publication_datasources pd
@@ -284,7 +284,7 @@ create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as
drop table if exists ${stats_db_name}.indi_pub_hybrid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_hybrid stored as parquet as
-select distinct p.id, coalesce(is_hybrid, 0) is_hybrid
+select /*+ COALESCE(100) */ distinct p.id, coalesce(is_hybrid, 0) is_hybrid
from ${stats_db_name}.publication p
left outer join (
select p.id, 1 as is_hybrid
@@ -313,7 +313,7 @@ create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet
where cast(year as int)>2003
group by ro.organization)
--return results_fair/all_results
-select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
+select /*+ COALESCE(100) */ allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization; /*EOS*/
@@ -336,7 +336,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
drop table if exists ${stats_db_name}.indi_org_fairness_pub_pr purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_pub_pr stored as parquet as
-select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
+select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
from allresults ar
join result_fair rf on rf.organization=ar.organization; /*EOS*/
@@ -357,7 +357,7 @@ CREATE TEMPORARY VIEW allresults as select year, ro.organization, count(distinct
drop table if exists ${stats_db_name}.indi_org_fairness_pub_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_pub_year stored as parquet as
-select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
+select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization and result_fair.year=allresults.year; /*EOS*/
@@ -381,7 +381,7 @@ CREATE TEMPORARY VIEW allresults as
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
-select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
+select /*+ COALESCE(100) */ ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
from allresults ar join result_fair rf
on rf.organization=ar.organization; /*EOS*/
@@ -404,7 +404,7 @@ CREATE TEMPORARY VIEW allresults as
drop table if exists ${stats_db_name}.indi_org_fairness_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_year stored as parquet as
-select cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
+select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness
from allresults
join result_fair on result_fair.organization=allresults.organization and cast(result_fair.year as int)=cast(allresults.year as int); /*EOS*/
@@ -427,7 +427,7 @@ CREATE TEMPORARY VIEW allresults as
drop table if exists ${stats_db_name}.indi_org_findable_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_findable_year stored as parquet as
-select cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
+select /*+ COALESCE(100) */ cast(allresults.year as int) year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults
join result_with_pid on result_with_pid.organization=allresults.organization and cast(result_with_pid.year as int)=cast(allresults.year as int); /*EOS*/
@@ -450,7 +450,7 @@ select ro.organization, count(distinct ro.id) no_allresults from ${stats_db_name
drop table if exists ${stats_db_name}.indi_org_findable purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_findable stored as parquet as
-select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
+select /*+ COALESCE(100) */ allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable
from allresults
join result_with_pid on result_with_pid.organization=allresults.organization; /*EOS*/
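[Editor's note: the indi_org_fairness* and indi_org_findable* tables are all ratios of two pre-aggregated counts per organization (and, in the *_year variants, per year): the results that satisfy the FAIR or findability criterion divided by all results since 2003. As a hypothetical worked example with invented numbers, an organization with no_result_fair = 40 and no_allresults = 50 gets org_fairness = 40/50 = 0.8.]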
@@ -516,7 +516,7 @@ select software_oa.organization, software_oa.no_oasoftware/allsoftware.no_allsof
drop table if exists ${stats_db_name}.indi_org_openess purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_openess stored as parquet as
-select allpubsshare.organization,
+select /*+ COALESCE(100) */ allpubsshare.organization,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare
@@ -593,7 +593,7 @@ select allsoftware.year, software_oa.organization, software_oa.no_oasoftware/all
drop table if exists ${stats_db_name}.indi_org_openess_year purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_openess_year stored as parquet as
-select cast(allpubsshare.year as int) year, allpubsshare.organization,
+select /*+ COALESCE(100) */ cast(allpubsshare.year as int) year, allpubsshare.organization,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end))
org_openess FROM allpubsshare
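[Editor's note: the org_openess expression averages the open-access shares of up to three result types per organization. Here p, s and d appear to be the publication, software and dataset shares coming from the allpubsshare, allsoftwaresshare and alldatasetssshare views: p is always counted, s and d contribute only when they are not NULL, and the denominator 1 + [s present] + [d present] counts how many shares actually contribute. A hypothetical worked example with invented numbers: with p = 0.6, s NULL and d = 0.4, the value is (0.6 + 0 + 0.4) / (1 + 0 + 1) = 0.5.]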
@@ -617,7 +617,7 @@ DROP VIEW allsoftwaresshare; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_has_preprint purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_has_preprint stored as parquet as
-select distinct p.id, coalesce(has_preprint, 0) as has_preprint
+select /*+ COALESCE(100) */ distinct p.id, coalesce(has_preprint, 0) as has_preprint
from ${stats_db_name}.publication_classifications p
left outer join (
select p.id, 1 as has_preprint
@@ -627,7 +627,7 @@ from ${stats_db_name}.publication_classifications p
drop table if exists ${stats_db_name}.indi_pub_in_subscribed purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_in_subscribed stored as parquet as
-select distinct p.id, coalesce(is_subscription, 0) as is_subscription
+select /*+ COALESCE(100) */ distinct p.id, coalesce(is_subscription, 0) as is_subscription
from ${stats_db_name}.publication p
left outer join(
select p.id, 1 as is_subscription from ${stats_db_name}.publication p
@@ -640,7 +640,7 @@ from ${stats_db_name}.publication p
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
-select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
+select /*+ COALESCE(100) */ distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
from ${stats_db_name}.result p
left outer join (
select p.id, 1 as result_with_pid
@@ -654,7 +654,7 @@ group by rf.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
-select distinct p.id as id, coalesce(is_interdisciplinary, 0)
+select /*+ COALESCE(100) */ distinct p.id as id, coalesce(is_interdisciplinary, 0)
as is_interdisciplinary
from pub_fos_totals p
left outer join (
@@ -666,7 +666,7 @@ drop view pub_fos_totals; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_bronze_oa purge; /*EOS*/
create table ${stats_db_name}.indi_pub_bronze_oa stored as parquet as
-select distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
+select /*+ COALESCE(100) */ distinct p.id,coalesce(is_bronze_oa,0) is_bronze_oa
from ${stats_db_name}.publication p
left outer join (
select p.id, 1 as is_bronze_oa
@@ -689,7 +689,7 @@ where p.end_year is NOT NULL and r.year is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_is_project_result_after purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_is_project_result_after stored as parquet as
-select pry.project_id, pry.acronym, pry.result_id,
+select /*+ COALESCE(100) */ pry.project_id, pry.acronym, pry.result_id,
coalesce(is_project_result_after, 0) as is_project_result_after
from project_year_result_year pry
left outer join (select pry.project_id, pry.acronym, pry.result_id, 1 as is_project_result_after
@@ -701,7 +701,7 @@ drop view project_year_result_year; /*EOS*/
drop table if exists ${stats_db_name}.indi_is_funder_plan_s purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_is_funder_plan_s stored as parquet as
-select distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s
+select /*+ COALESCE(100) */ distinct f.id, f.name, coalesce(is_funder_plan_s, 0) as is_funder_plan_s
from ${stats_db_name}.funder f
left outer join (select id, name, 1 as is_funder_plan_s from ${stats_db_name}.funder
join stats_ext.plan_s_short on c_o_alition_s_organisation_funder=name) tmp
@@ -722,7 +722,7 @@ create table if not exists ${stats_db_name}.indi_funder_fairness stored as parqu
join ${stats_db_name}.project p on p.id=rp.project
where cast(year as int)>2003
group by p.funder)
-select allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness
+select /*+ COALESCE(100) */ allresults.funder, result_fair.no_result_fair/allresults.no_allresults funder_fairness
from allresults
join result_fair on result_fair.funder=allresults.funder; /*EOS*/
@@ -745,7 +745,7 @@ allresults as
join ${stats_db_name}.result r on r.id=rc.id
where cast(year as int)>2003
group by rc.ri_initiative)
-select allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness
+select /*+ COALESCE(100) */ allresults.ri_initiative, result_fair.no_result_fair/allresults.no_allresults ris_fairness
from allresults
join result_fair on result_fair.ri_initiative=allresults.ri_initiative; /*EOS*/
@@ -817,15 +817,13 @@ select software_oa.funder, software_oa.no_oasoftware/allsoftware.no_allsoftware
drop table if exists ${stats_db_name}.indi_funder_openess purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_funder_openess stored as parquet as
-select allpubsshare.funder,
+select /*+ COALESCE(100) */ allpubsshare.funder,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
-+(case when d is null then 0 else 1 end))
-funder_openess FROM allpubsshare
-left outer join (select funder,d from
-alldatasetssshare) tmp1
++(case when d is null then 0 else 1 end)) funder_openess
+FROM allpubsshare
+left outer join (select funder,d from alldatasetssshare) tmp1
on tmp1.funder=allpubsshare.funder
-left outer join (select funder,s from
-allsoftwaresshare) tmp2
+left outer join (select funder,s from allsoftwaresshare) tmp2
on tmp2.funder=allpubsshare.funder; /*EOS*/
DROP VIEW pubs_oa; /*EOS*/
@@ -905,7 +903,7 @@ select software_oa.ri_initiative, software_oa.no_oasoftware/allsoftware.no_allso
drop table if exists ${stats_db_name}.indi_ris_openess purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_ris_openess stored as parquet as
-select allpubsshare.ri_initiative,
+select /*+ COALESCE(100) */ allpubsshare.ri_initiative,
(p+if(isnull(s),0,s)+if(isnull(d),0,d))/(1+(case when s is null then 0 else 1 end)
+(case when d is null then 0 else 1 end))
ris_openess FROM allpubsshare
@@ -943,7 +941,7 @@ with result_findable as
join ${stats_db_name}.project p on p.id=rp.project
where cast(year as int)>2003
group by p.funder)
-select allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable
+select /*+ COALESCE(100) */ allresults.funder, result_findable.no_result_findable/allresults.no_allresults funder_findable
from allresults
join result_findable on result_findable.funder=allresults.funder; /*EOS*/
@@ -952,41 +950,43 @@ drop table if exists ${stats_db_name}.indi_ris_findable purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_ris_findable stored as parquet as
with result_contexts as
(select distinct rc.id, context.name ri_initiative from ${stats_db_name}.result_concepts rc
join ${stats_db_name}.concept on concept.id=rc.concept
join ${stats_db_name}.category on category.id=concept.category
join ${stats_db_name}.context on context.id=category.context),
result_findable as
(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_result_findable from result_contexts rc
join ${stats_db_name}.result r on r.id=rc.id
join ${stats_db_name}.result_pids rp on rp.id=r.id
where cast(r.year as int)>2003
group by rc.ri_initiative),
allresults as
(select rc.ri_initiative ri_initiative, count(distinct rc.id) no_allresults from result_contexts rc
join ${stats_db_name}.result r on r.id=rc.id
where cast(r.year as int)>2003
group by rc.ri_initiative)
-select allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable
+select /*+ COALESCE(100) */ allresults.ri_initiative, result_findable.no_result_findable/allresults.no_allresults ris_findable
from allresults
join result_findable on result_findable.ri_initiative=allresults.ri_initiative; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_publicly_funded purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_publicly_funded stored as parquet as
with org_names_pids as
(select org.id,name, pid from ${stats_db_name}.organization org
join ${stats_db_name}.organization_pids op on org.id=op.id),
publicly_funded_orgs as
(select distinct name from
(select pf.name from stats_ext.insitutions_for_publicly_funded pf
join ${stats_db_name}.fundref f on f.name=pf.name where f.type='government'
union all
select pf.name from stats_ext.insitutions_for_publicly_funded pf
join ${stats_db_name}.project p on p.funder=pf.name
union all
select op.name from stats_ext.insitutions_for_publicly_funded pf
join org_names_pids op on (op.name=pf.name or op.pid=pf.ror)
and pf.publicly_funded='yes') foo)
-select distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
+select /*+ COALESCE(100) */ distinct p.id, coalesce(publicly_funded, 0) as publicly_funded
from ${stats_db_name}.publication p
left outer join (
select distinct ro.id, 1 as publicly_funded from ${stats_db_name}.result_organization ro
@@ -995,7 +995,7 @@ join publicly_funded_orgs pfo on o.name=pfo.name) tmp on p.id=tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_green_with_license purge; /*EOS*/
create table ${stats_db_name}.indi_pub_green_with_license stored as parquet as
-select distinct p.id, coalesce(green_with_license, 0) as green_with_license
+select /*+ COALESCE(100) */ distinct p.id, coalesce(green_with_license, 0) as green_with_license
from ${stats_db_name}.publication p
left outer join (
select distinct p.id, 1 as green_with_license from ${stats_db_name}.publication p
@@ -1006,7 +1006,7 @@ left outer join (
drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
create table ${stats_db_name}.result_country stored as parquet as
-select distinct id, country
+select /*+ COALESCE(100) */ distinct id, country
from (
select ro.id, o.country
from ${stats_db_name}.result_organization ro
@@ -1021,7 +1021,7 @@ where rc.country is not null; /*EOS*/
drop table if exists ${stats_db_name}.indi_result_oa_with_license purge; /*EOS*/
create table ${stats_db_name}.indi_result_oa_with_license stored as parquet as
-select distinct r.id, coalesce(oa_with_license,0) as oa_with_license
+select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_with_license,0) as oa_with_license
from ${stats_db_name}.result r
left outer join (select distinct r.id, 1 as oa_with_license from ${stats_db_name}.result r
join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open Access') tmp on r.id=tmp.id; /*EOS*/
@@ -1029,9 +1029,9 @@ join ${stats_db_name}.result_licenses rl on rl.id=r.id where r.bestlicence='Open
drop table if exists ${stats_db_name}.indi_result_oa_without_license purge; /*EOS*/
create table ${stats_db_name}.indi_result_oa_without_license stored as parquet as
with without_license as
(select distinct id from ${stats_db_name}.indi_result_oa_with_license
where oa_with_license=0)
-select distinct r.id, coalesce(oa_without_license,0) as oa_without_license
+select /*+ COALESCE(100) */ distinct r.id, coalesce(oa_without_license,0) as oa_without_license
from ${stats_db_name}.result r
left outer join (select distinct r.id, 1 as oa_without_license
from ${stats_db_name}.result r
@@ -1042,7 +1042,7 @@ drop table if exists ${stats_db_name}.indi_result_under_transformative purge; /*
create table ${stats_db_name}.indi_result_under_transformative stored as parquet as
with transformative_dois as (
select distinct doi from stats_ext.transformative_facts)
-select distinct r.id, coalesce(under_transformative,0) as under_transformative
+select /*+ COALESCE(100) */ distinct r.id, coalesce(under_transformative,0) as under_transformative
from ${stats_db_name}.result r
left outer join (
select distinct rp.id, 1 as under_transformative


@@ -1,30 +1,30 @@
-set mapred.job.queue.name=analytics;
+set mapred.job.queue.name=analytics; /*EOS*/
----------------------------------------------------
-- Shortcuts for various definitions in stats db ---
----------------------------------------------------
-- Peer reviewed:
-drop table if exists ${stats_db_name}.result_peerreviewed purge;
+drop table if exists ${stats_db_name}.result_peerreviewed purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as
-select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
+select /*+ COALESCE(100) */ r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed
from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id
-left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id;
+left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; /*EOS*/
-- Green OA:
-drop table if exists ${stats_db_name}.result_greenoa purge;
+drop table if exists ${stats_db_name}.result_greenoa purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as
-select r.id, case when green.green_oa=1 then true else false end as green
+select /*+ COALESCE(100) */ r.id, case when green.green_oa=1 then true else false end as green
from ${stats_db_name}.result r
-left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id;
+left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; /*EOS*/
-- GOLD OA:
-drop table if exists ${stats_db_name}.result_gold purge;
+drop table if exists ${stats_db_name}.result_gold purge; /*EOS*/
create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as
-select r.id, case when gold.is_gold=1 then true else false end as gold
+select /*+ COALESCE(100) */ r.id, case when gold.is_gold=1 then true else false end as gold
from ${stats_db_name}.result r
-left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id;
+left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; /*EOS*/
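[Editor's note: result_peerreviewed, result_greenoa and result_gold are per-result boolean shortcut tables derived from the indi_* indicator tables: a result is flagged peer_reviewed when it has a CrossRef DOI and is not grey literature, green when indi_pub_green_oa = 1, and gold when indi_pub_gold_oa = 1. The query below is a hypothetical downstream usage example, not part of this changeset.]

-- Hypothetical usage example (not part of this diff): count gold and green results per year.
select r.`year`,
       sum(case when gold.gold then 1 else 0 end) as gold_results,
       sum(case when green.green then 1 else 0 end) as green_results
from ${stats_db_name}.result r
left outer join ${stats_db_name}.result_gold gold on gold.id = r.id
left outer join ${stats_db_name}.result_greenoa green on green.id = r.id
group by r.`year`;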


@@ -1,58 +1,26 @@
-set mapred.job.queue.name=analytics;
+set mapred.job.queue.name=analytics; /*EOS*/
--- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
+-- replace the creation of the result view with a table, which will include the boolean fields from the previous tables (green, gold,
-- peer reviewed)
-drop table if exists ${stats_db_name}.result_tmp;
-CREATE TABLE ${stats_db_name}.result_tmp (
-id STRING,
-title STRING,
-publisher STRING,
-journal STRING,
-`date` STRING,
-`year` INT,
-bestlicence STRING,
-access_mode STRING,
-embargo_end_date STRING,
-delayed BOOLEAN,
-authors INT,
-source STRING,
-abstract BOOLEAN,
-type STRING ,
-peer_reviewed BOOLEAN,
-green BOOLEAN,
-gold BOOLEAN)
-clustered by (id) into 100 buckets stored as orc tblproperties('transactional'='true');
-insert into ${stats_db_name}.result_tmp
-select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
-FROM ${stats_db_name}.publication r
+drop view if exists ${stats_db_name}.result; /*EOS*/
+drop table if exists ${stats_db_name}.result; /*EOS*/
+CREATE TABLE ${stats_db_name}.result stored as parquet as
+SELECT /*+ COALESCE(100) */ r.id, r.title, r.publisher, r.journal, r.`date`, DATE_FORMAT(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
+FROM (
+(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
+FROM ${stats_db_name}.publication)
+UNION ALL
+(SELECT id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
+FROM ${stats_db_name}.dataset)
+UNION ALL
+(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
+FROM ${stats_db_name}.software)
+UNION ALL
+(select id, title, p.publisher, journal, `date`, DATE_FORMAT(`date`, 'yyyy'), bestlicence, bestlicence, embargo_end_date, delayed, authors, source, abstract, type
+FROM ${stats_db_name}.otherresearchproduct)
+) r
LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
+LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id; /*EOS*/
-insert into ${stats_db_name}.result_tmp
-select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
-FROM ${stats_db_name}.dataset r
-LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
-insert into ${stats_db_name}.result_tmp
-select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
-FROM ${stats_db_name}.software r
-LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
-insert into ${stats_db_name}.result_tmp
-select r.id, r.title, r.publisher, r.journal, r.`date`, date_format(r.`date`, 'yyyy'), r.bestlicence, r.bestlicence, r.embargo_end_date, r.delayed, r.authors, r.source, r.abstract, r.type, pr.peer_reviewed, green.green, gold.gold
-FROM ${stats_db_name}.otherresearchproduct r
-LEFT OUTER JOIN ${stats_db_name}.result_peerreviewed pr on pr.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_greenoa green on green.id=r.id
-LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
-drop table if exists ${stats_db_name}.result;
-drop view if exists ${stats_db_name}.result;
-create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
-drop table ${stats_db_name}.result_tmp;
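[Editor's note: the rewrite above replaces the old transactional ORC result_tmp staging table and its four separate INSERTs with a single parquet CTAS that UNIONs publications, datasets, software and other research products and joins the peer_reviewed/green/gold flags in one pass. The query below is a hypothetical sanity check, not part of this changeset.]

-- Hypothetical check (not part of this diff): one row count per result type in the rebuilt table.
select type, count(*) as n
from ${stats_db_name}.result
group by type;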

Some files were not shown because too many files have changed in this diff.