Compare commits
204 Commits
main
...
csv_collec
Author | SHA1 | Date |
---|---|---|
Michele Artini | 48c9070893 | |
Michele Artini | d8f5be1149 | |
Michele Artini | 3777f3c15d | |
Miriam Baglioni | ce22b1d536 | |
Miriam Baglioni | 19a9bddab1 | |
Miriam Baglioni | 69dad7e2bf | |
Miriam Baglioni | 9657707ab0 | |
Sandro La Bruzzo | dd6ed31383 | |
Sandro La Bruzzo | 0d05006114 | |
Claudio Atzori | e4b814b3f1 | |
Claudio Atzori | 5c7f7fb3b8 | |
Claudio Atzori | 9e6b1f2f24 | |
Miriam Baglioni | 666155bafa | |
Miriam Baglioni | ee84db7a6a | |
Claudio Atzori | 77308ed525 | |
Miriam Baglioni | 302c4d044e | |
Claudio Atzori | 60da306830 | |
Claudio Atzori | 8a5ba8df45 | |
Claudio Atzori | dade7d5bb8 | |
Claudio Atzori | f57446ad16 | |
Michele De Bonis | 1c144a4dcb | |
Sandro La Bruzzo | fd1038b44d | |
Giambattista Bloisi | fed13e083e | |
Michele De Bonis | 6af3fd16b6 | |
Michele De Bonis | bde59a7c8f | |
sandro.labruzzo | 730a7751b6 | |
Miriam Baglioni | 89b7bc84f2 | |
sandro.labruzzo | 5f134c4045 | |
sandro.labruzzo | 4034da7579 | |
sandro.labruzzo | 32e2a8b340 | |
Michele Artini | 65902a87e3 | |
sandro.labruzzo | cc6bbbb804 | |
sandro.labruzzo | 0517e452e3 | |
Miriam Baglioni | ca2d480df3 | |
Claudio Atzori | 2e54715d71 | |
Miriam Baglioni | 189a7c255a | |
Miriam Baglioni | 821700299a | |
Miriam Baglioni | e5b04e61ff | |
Claudio Atzori | 15227f82b8 | |
sandro.labruzzo | ac8995ab64 | |
sandro.labruzzo | 496007188a | |
Claudio Atzori | 4e55ddc547 | |
Claudio Atzori | ef51a60f19 | |
Claudio Atzori | ff5cb32067 | |
Claudio Atzori | a48d080e08 | |
Claudio Atzori | 5d34432398 | |
sandro.labruzzo | a1297082e2 | |
Michele De Bonis | c97facf5e6 | |
Claudio Atzori | 9e439f5eca | |
Claudio Atzori | cf7d9a32ab | |
Claudio Atzori | 5f512f510e | |
Claudio Atzori | b95672b420 | |
Claudio Atzori | 9e8849b753 | |
sandro.labruzzo | 4778a70478 | |
Claudio Atzori | 4a3b173ca2 | |
sandro.labruzzo | ac0a94d62d | |
Giambattista Bloisi | 5ee8881646 | |
Miriam Baglioni | fb1f0f8850 | |
Giambattista Bloisi | 5b4d821bf9 | |
Giambattista Bloisi | 03c262ccb9 | |
sandro.labruzzo | a1d5ad5c26 | |
sandro.labruzzo | b0478c380e | |
Claudio Atzori | 07f267bb10 | |
Claudio Atzori | 8088943399 | |
Claudio Atzori | 6c5df761e2 | |
Claudio Atzori | 9f7a606ddd | |
Miriam Baglioni | 250f101779 | |
Miriam Baglioni | f1ea9da5bc | |
Miriam Baglioni | b0283fe94c | |
sandro.labruzzo | 474f365286 | |
sandro.labruzzo | 19ce783e58 | |
Sandro La Bruzzo | 0d0904f4ec | |
Giambattista Bloisi | f31f22801f | |
Miriam Baglioni | 6fd9ec8566 | |
Giambattista Bloisi | 8f5171557e | |
Claudio Atzori | f7bb53fe78 | |
Claudio Atzori | 973aa7dca6 | |
Sandro La Bruzzo | c1cef5d685 | |
Sandro La Bruzzo | a8ed5a3b04 | |
Claudio Atzori | a42c8b7c85 | |
Claudio Atzori | a877c76d70 | |
Claudio Atzori | 26cdc7e439 | |
Claudio Atzori | 323c76eafc | |
Miriam Baglioni | 69aee609ef | |
Claudio Atzori | 5ca031c8d6 | |
Claudio Atzori | 499892b67c | |
Claudio Atzori | e4504fd98d | |
Claudio Atzori | 9b4415cb67 | |
Claudio Atzori | e6ca382deb | |
Claudio Atzori | 940735921f | |
Giambattista Bloisi | 56224e034a | |
Miriam Baglioni | 5916346ba1 | |
Claudio Atzori | e4abe55988 | |
Claudio Atzori | d71df6de19 | |
Claudio Atzori | 1cdcd07a7e | |
Claudio Atzori | 6fd50266f1 | |
Claudio Atzori | dffa376eb6 | |
Claudio Atzori | 32fa579b80 | |
Claudio Atzori | 67e37f41fb | |
Miriam Baglioni | 0fb6af5586 | |
Claudio Atzori | dcba5ad32a | |
Claudio Atzori | 46dbb62598 | |
Claudio Atzori | d3764265d5 | |
Claudio Atzori | 4a9aeb6238 | |
Claudio Atzori | 8172bee8c8 | |
Miriam Baglioni | 1fce7d5a0f | |
Miriam Baglioni | 842cc75dae | |
Miriam Baglioni | e75326d6ec | |
Miriam Baglioni | 32f444984e | |
Miriam Baglioni | cab8f1135f | |
Miriam Baglioni | c93bf82487 | |
Miriam Baglioni | a7699558ed | |
Miriam Baglioni | 01679c935a | |
Miriam Baglioni | c773421cc7 | |
Miriam Baglioni | cf07ed9058 | |
Miriam Baglioni | c921cf7ee0 | |
Giambattista Bloisi | 6bc741715c | |
Giambattista Bloisi | aa7b8fd014 | |
Giambattista Bloisi | 0e34b0ece1 | |
Miriam Baglioni | aac5eb3499 | |
Miriam Baglioni | 821540f94a | |
Miriam Baglioni | 09a2c93fc7 | |
Miriam Baglioni | ce4ee1189f | |
Miriam Baglioni | 2b27afaec8 | |
Miriam Baglioni | 0e5dd14538 | |
Michele De Bonis | 6c17993d16 | |
Michele De Bonis | eab623ddfa | |
Michele De Bonis | 5015ba10eb | |
Giambattista Bloisi | 56b05cde0b | |
Michele De Bonis | 62c4c3ed29 | |
Claudio Atzori | 62ff843334 | |
Claudio Atzori | d5867a1992 | |
Claudio Atzori | e5df68772d | |
Miriam Baglioni | 7e6d12fa77 | |
Miriam Baglioni | 191fc3a461 | |
Claudio Atzori | 10696f2a44 | |
Claudio Atzori | 5734b80861 | |
Antonis Lempesis | f3c179658a | |
Miriam Baglioni | b18ad035c1 | |
Miriam Baglioni | e430826e00 | |
Giambattista Bloisi | c45cae447a | |
Claudio Atzori | 3fcafc7ed6 | |
Miriam Baglioni | 599e56dbc6 | |
Claudio Atzori | 6397141e56 | |
Claudio Atzori | e354f9853a | |
Claudio Atzori | 535a7b99f1 | |
Sandro La Bruzzo | 6a097abc89 | |
Michele Artini | 9754521847 | |
Michele Artini | 54f8b4da39 | |
Claudio Atzori | 4f0463d779 | |
Miriam Baglioni | 4d3e079590 | |
Claudio Atzori | d1cadc77c9 | |
Michele Artini | 0e89d4a1cf | |
Michele Artini | e941adbe2b | |
Michele Artini | 7f81673f3c | |
Michele Artini | fdbe629f49 | |
Antonis Lempesis | 619aa34a15 | |
Antonis Lempesis | dbea7a4072 | |
Antonis Lempesis | c9241dba0d | |
Claudio Atzori | e0ff84baf0 | |
Michele Artini | 755a5aefcf | |
Claudio Atzori | 5f86c93be6 | |
Michele Artini | db6f137cf9 | |
Claudio Atzori | 23e0ab3a7c | |
Michele De Bonis | 6df6b4583e | |
Alessia | 07e6e7b4d6 | |
Antonis Lempesis | 37ad259296 | |
Antonis Lempesis | b64c144abf | |
Serafeim Chatzopoulos | b043f8a963 | |
Serafeim Chatzopoulos | db03f85366 | |
Miriam Baglioni | 468f2aa5a5 | |
Miriam Baglioni | 89fcf4086c | |
Miriam Baglioni | 45605f93ae | |
Miriam Baglioni | 5a7ba77271 | |
Miriam Baglioni | 8c185a7b1a | |
Claudio Atzori | e16616b964 | |
Miriam Baglioni | 985ca15264 | |
Claudio Atzori | 0bf76f2a34 | |
Claudio Atzori | 975d44cac7 | |
Claudio Atzori | 6bdb8643e6 | |
Claudio Atzori | 9486e21a44 | |
Claudio Atzori | 75a11d0ba5 | |
Antonis Lempesis | d0590e0e49 | |
Antonis Lempesis | 7d2c0a3723 | |
Lampros Smyrnaios | e9686365a2 | |
Lampros Smyrnaios | ce0aee21cc | |
Lampros Smyrnaios | 7b7dd32ad5 | |
Lampros Smyrnaios | 7ce051d766 | |
Lampros Smyrnaios | aa4d7d5e20 | |
Lampros Smyrnaios | 54e11b6a43 | |
Lampros Smyrnaios | fe2275a9b0 | |
Lampros Smyrnaios | a644a6f4fe | |
Lampros Smyrnaios | 888637773c | |
Lampros Smyrnaios | e0ac494859 | |
Lampros Smyrnaios | 3c17183d10 | |
Lampros Smyrnaios | 69a9ac7393 | |
Lampros Smyrnaios | 342223f75c | |
Lampros Smyrnaios | 2616971e2b | |
Lampros Smyrnaios | ba533d9f34 | |
Lampros Smyrnaios | d46b78b659 | |
Lampros Smyrnaios | 6f2ebb2a52 | |
Lampros Smyrnaios | ca091c0f1e | |
Lampros Smyrnaios | 0b897f2f66 | |
Lampros Smyrnaios | db33f7727c |
|
@ -28,3 +28,4 @@ spark-warehouse
|
|||
/**/.scalafmt.conf
|
||||
/.java-version
|
||||
/dhp-shade-package/dependency-reduced-pom.xml
|
||||
/**/job.properties
|
||||
|
|
|
@ -10,6 +10,11 @@ public class Constants {
|
|||
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
|
||||
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
|
||||
|
||||
public static final String RAID_NS_PREFIX = "raid________";
|
||||
|
||||
public static final String END_DATE = "endDate";
|
||||
public static final String START_DATE = "startDate";
|
||||
|
||||
public static final String ROR_NS_PREFIX = "ror_________";
|
||||
|
||||
public static final String ROR_OPENAIRE_ID = "10|openaire____::993a7ae7a863813cf95028b50708e222";
|
||||
|
|
|
@ -212,11 +212,11 @@ public class HttpConnector2 {
|
|||
.format(
|
||||
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
|
||||
MAPPER.writeValueAsString(report)));
|
||||
} catch (MalformedURLException | UnknownHostException e) {
|
||||
} catch (MalformedURLException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
throw new CollectorException(e.getMessage(), e);
|
||||
} catch (SocketTimeoutException | SocketException e) {
|
||||
} catch (SocketTimeoutException | SocketException | UnknownHostException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.personentity;
|
||||
package eu.dnetlib.dhp.common.person;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
|
@ -61,7 +61,7 @@ public class CoAuthorshipIterator implements Iterator<Relation> {
|
|||
private Relation getRelation(String orcid1, String orcid2) {
|
||||
String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1);
|
||||
String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2);
|
||||
return OafMapperUtils
|
||||
Relation relation = OafMapperUtils
|
||||
.getRelation(
|
||||
source, target, ModelConstants.PERSON_PERSON_RELTYPE,
|
||||
ModelConstants.PERSON_PERSON_SUBRELTYPE,
|
||||
|
@ -76,5 +76,7 @@ public class CoAuthorshipIterator implements Iterator<Relation> {
|
|||
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.91"),
|
||||
null);
|
||||
relation.setValidated(true);
|
||||
return relation;
|
||||
}
|
||||
}
|
|
@ -1,12 +1,9 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.personentity;
|
||||
package eu.dnetlib.dhp.common.person;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class Coauthors implements Serializable {
|
||||
private List<String> coauthors;
|
||||
|
|
@ -2,8 +2,7 @@
|
|||
package eu.dnetlib.dhp.oa.merge;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static org.apache.spark.sql.functions.col;
|
||||
import static org.apache.spark.sql.functions.when;
|
||||
import static org.apache.spark.sql.functions.*;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
@ -135,7 +134,9 @@ public class GroupEntitiesSparkJob {
|
|||
.applyCoarVocabularies(entity, vocs),
|
||||
OAFENTITY_KRYO_ENC)
|
||||
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, OafEntity, OafEntity>) (key, group) -> MergeUtils.mergeById(group, vocs),
|
||||
OAFENTITY_KRYO_ENC)
|
||||
.map(
|
||||
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
|
||||
t.getClass().getName(), t),
|
||||
|
|
|
@ -65,7 +65,13 @@ public class RunSQLSparkJob {
|
|||
for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
|
||||
log.info("executing: {}", statement);
|
||||
long startTime = System.currentTimeMillis();
|
||||
spark.sql(statement).show();
|
||||
try {
|
||||
spark.sql(statement).show();
|
||||
} catch (Exception e) {
|
||||
log.error("Error executing statement: {}", statement, e);
|
||||
System.err.println("Error executing statement: " + statement + "\n" + e);
|
||||
throw e;
|
||||
}
|
||||
log
|
||||
.info(
|
||||
"executed in {}",
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright (c) 2024.
|
||||
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import org.apache.commons.lang3.builder.EqualsBuilder;
|
||||
import org.apache.commons.lang3.builder.HashCodeBuilder;
|
||||
|
||||
public class HashableStructuredProperty extends StructuredProperty {
|
||||
|
||||
private static final long serialVersionUID = 8371670185221126045L;
|
||||
|
||||
public static HashableStructuredProperty newInstance(String value, Qualifier qualifier, DataInfo dataInfo) {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
final HashableStructuredProperty sp = new HashableStructuredProperty();
|
||||
sp.setValue(value);
|
||||
sp.setQualifier(qualifier);
|
||||
sp.setDataInfo(dataInfo);
|
||||
return sp;
|
||||
}
|
||||
|
||||
public static HashableStructuredProperty newInstance(StructuredProperty sp) {
|
||||
HashableStructuredProperty hsp = new HashableStructuredProperty();
|
||||
hsp.setQualifier(sp.getQualifier());
|
||||
hsp.setValue(sp.getValue());
|
||||
hsp.setQualifier(sp.getQualifier());
|
||||
return hsp;
|
||||
}
|
||||
|
||||
public static StructuredProperty toStructuredProperty(HashableStructuredProperty hsp) {
|
||||
StructuredProperty sp = new StructuredProperty();
|
||||
sp.setQualifier(hsp.getQualifier());
|
||||
sp.setValue(hsp.getValue());
|
||||
sp.setQualifier(hsp.getQualifier());
|
||||
return sp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return new HashCodeBuilder(11, 91)
|
||||
.append(getQualifier().getClassid())
|
||||
.append(getQualifier().getSchemeid())
|
||||
.append(getValue())
|
||||
.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (obj == this) {
|
||||
return true;
|
||||
}
|
||||
if (obj.getClass() != getClass()) {
|
||||
return false;
|
||||
}
|
||||
final HashableStructuredProperty rhs = (HashableStructuredProperty) obj;
|
||||
return new EqualsBuilder()
|
||||
.append(getQualifier().getClassid(), rhs.getQualifier().getClassid())
|
||||
.append(getQualifier().getSchemeid(), rhs.getQualifier().getSchemeid())
|
||||
.append(getValue(), rhs.getValue())
|
||||
.isEquals();
|
||||
}
|
||||
}
|
|
@ -43,34 +43,4 @@ public class CleaningFunctions {
|
|||
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method that normalises PID values on a per-type basis.
|
||||
* @param pid the PID whose value will be normalised.
|
||||
* @return the PID containing the normalised value.
|
||||
*/
|
||||
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
||||
pid
|
||||
.setValue(
|
||||
normalizePidValue(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue()));
|
||||
|
||||
return pid;
|
||||
}
|
||||
|
||||
public static String normalizePidValue(String pidType, String pidValue) {
|
||||
String value = Optional
|
||||
.ofNullable(pidValue)
|
||||
.map(String::trim)
|
||||
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
||||
|
||||
switch (pidType) {
|
||||
|
||||
// TODO add cleaning for more PID types as needed
|
||||
case "doi":
|
||||
return value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -6,18 +6,11 @@ import org.apache.commons.lang3.StringUtils;
|
|||
public class DoiCleaningRule {
|
||||
|
||||
public static String clean(final String doi) {
|
||||
return doi
|
||||
.toLowerCase()
|
||||
.replaceAll("\\s", "")
|
||||
.replaceAll("^doi:", "")
|
||||
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
|
||||
}
|
||||
|
||||
public static String normalizeDoi(final String input) {
|
||||
if (input == null)
|
||||
if (doi == null)
|
||||
return null;
|
||||
final String replaced = input
|
||||
final String replaced = doi
|
||||
.replaceAll("\\n|\\r|\\t|\\s", "")
|
||||
.replaceAll("^doi:", "")
|
||||
.toLowerCase()
|
||||
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
|
||||
if (StringUtils.isEmpty(replaced))
|
||||
|
@ -32,7 +25,6 @@ public class DoiCleaningRule {
|
|||
return null;
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.OPENAIRE_META_RESOURCE_TYPE;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
|
@ -363,6 +362,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Person) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Organization) {
|
||||
Organization o = (Organization) value;
|
||||
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
|
||||
|
@ -563,12 +564,24 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
Optional
|
||||
.ofNullable(i.getPid())
|
||||
.ifPresent(pid -> {
|
||||
final Set<StructuredProperty> pids = Sets.newHashSet(pid);
|
||||
final Set<HashableStructuredProperty> pids = pid
|
||||
.stream()
|
||||
.map(HashableStructuredProperty::newInstance)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
Optional
|
||||
.ofNullable(i.getAlternateIdentifier())
|
||||
.ifPresent(altId -> {
|
||||
final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
|
||||
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
|
||||
final Set<HashableStructuredProperty> altIds = altId
|
||||
.stream()
|
||||
.map(HashableStructuredProperty::newInstance)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
i
|
||||
.setAlternateIdentifier(
|
||||
Sets
|
||||
.difference(altIds, pids)
|
||||
.stream()
|
||||
.map(HashableStructuredProperty::toStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -682,6 +695,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
}
|
||||
}
|
||||
|
||||
// set ORCID_PENDING to all orcid values that are not coming from ORCID provenance
|
||||
for (Author a : r.getAuthor()) {
|
||||
if (Objects.isNull(a.getPid())) {
|
||||
a.setPid(Lists.newArrayList());
|
||||
|
@ -738,6 +752,40 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
// Identify clashing ORCIDs, i.e. the same ORCID associated with multiple authors in this result
|
||||
Map<String, Integer> clashing_orcid = new HashMap<>();
|
||||
|
||||
for (Author a : r.getAuthor()) {
|
||||
a
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(
|
||||
p -> StringUtils
|
||||
.contains(StringUtils.lowerCase(p.getQualifier().getClassid()), ORCID_PENDING))
|
||||
.map(StructuredProperty::getValue)
|
||||
.distinct()
|
||||
.forEach(orcid -> clashing_orcid.compute(orcid, (k, v) -> (v == null) ? 1 : v + 1));
|
||||
}
|
||||
|
||||
Set<String> clashing = clashing_orcid
|
||||
.entrySet()
|
||||
.stream()
|
||||
.filter(ee -> ee.getValue() > 1)
|
||||
.map(Map.Entry::getKey)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
// filter out clashing orcids
|
||||
for (Author a : r.getAuthor()) {
|
||||
a
|
||||
.setPid(
|
||||
a
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(p -> !clashing.contains(p.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
}
|
||||
if (value instanceof Publication) {
|
||||
|
||||
|
@ -796,7 +844,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
return author;
|
||||
}
|
||||
|
||||
private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
|
||||
public static Optional<String> cleanDateField(Field<String> dateofacceptance) {
|
||||
return Optional
|
||||
.ofNullable(dateofacceptance)
|
||||
.map(Field::getValue)
|
||||
|
|
|
@ -175,7 +175,7 @@ public class IdentifierFactory implements Serializable {
|
|||
return entity
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.map(PidCleaner::normalizePidValue)
|
||||
.filter(CleaningFunctions::pidFilter)
|
||||
.collect(
|
||||
Collectors
|
||||
|
@ -204,10 +204,11 @@ public class IdentifierFactory implements Serializable {
|
|||
.map(
|
||||
pp -> pp
|
||||
.stream()
|
||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
// filter away PIDs provided by a DS that is not considered an authority for the
|
||||
// given PID Type
|
||||
.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.map(PidCleaner::normalizePidValue)
|
||||
.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
|
||||
.filter(CleaningFunctions::pidFilter))
|
||||
.orElse(Stream.empty());
|
||||
|
|
|
@ -96,7 +96,7 @@ public class MergeEntitiesComparator implements Comparator<Oaf> {
|
|||
// id
|
||||
if (res == 0) {
|
||||
if (left instanceof OafEntity && right instanceof OafEntity) {
|
||||
res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
|
||||
res = ((OafEntity) right).getId().compareTo(((OafEntity) left).getId());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -23,24 +23,30 @@ import org.apache.commons.lang3.tuple.Pair;
|
|||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||
import com.google.common.base.Joiner;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
|
||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class MergeUtils {
|
||||
|
||||
public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
|
||||
return mergeGroup(s, oafEntityIterator, true);
|
||||
public static <T extends Oaf> T mergeById(Iterator<T> oafEntityIterator, VocabularyGroup vocs) {
|
||||
return mergeGroup(oafEntityIterator, true, vocs);
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
|
||||
return mergeGroup(s, oafEntityIterator, false);
|
||||
public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator) {
|
||||
return mergeGroup(oafEntityIterator, false);
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
|
||||
boolean checkDelegateAuthority) {
|
||||
public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator, boolean checkDelegateAuthority) {
|
||||
return mergeGroup(oafEntityIterator, checkDelegateAuthority, null);
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T mergeGroup(Iterator<T> oafEntityIterator,
|
||||
boolean checkDelegateAuthority, VocabularyGroup vocs) {
|
||||
|
||||
ArrayList<T> sortedEntities = new ArrayList<>();
|
||||
oafEntityIterator.forEachRemaining(sortedEntities::add);
|
||||
|
@ -49,13 +55,55 @@ public class MergeUtils {
|
|||
Iterator<T> it = sortedEntities.iterator();
|
||||
T merged = it.next();
|
||||
|
||||
while (it.hasNext()) {
|
||||
merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
|
||||
if (!it.hasNext() && merged instanceof Result && vocs != null) {
|
||||
return enforceResultType(vocs, (Result) merged);
|
||||
} else {
|
||||
while (it.hasNext()) {
|
||||
merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
|
||||
}
|
||||
}
|
||||
|
||||
return merged;
|
||||
}
|
||||
|
||||
private static <T extends Oaf> T enforceResultType(VocabularyGroup vocs, Result mergedResult) {
|
||||
if (Optional.ofNullable(mergedResult.getInstance()).map(List::isEmpty).orElse(true)) {
|
||||
return (T) mergedResult;
|
||||
} else {
|
||||
final Instance i = mergedResult.getInstance().get(0);
|
||||
|
||||
if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) {
|
||||
return (T) mergedResult;
|
||||
} else {
|
||||
final String expectedResultType = Optional
|
||||
.ofNullable(
|
||||
vocs
|
||||
.lookupTermBySynonym(
|
||||
ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid()))
|
||||
.orElse(ModelConstants.ORP_DEFAULT_RESULTTYPE)
|
||||
.getClassid();
|
||||
|
||||
// there is a clash among the result types
|
||||
if (!expectedResultType.equals(mergedResult.getResulttype().getClassid())) {
|
||||
|
||||
Result result = (Result) Optional
|
||||
.ofNullable(ModelSupport.oafTypes.get(expectedResultType))
|
||||
.map(r -> {
|
||||
try {
|
||||
return r.newInstance();
|
||||
} catch (InstantiationException | IllegalAccessException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
})
|
||||
.orElse(new OtherResearchProduct());
|
||||
result.setId(mergedResult.getId());
|
||||
return (T) mergeResultFields(result, mergedResult);
|
||||
} else {
|
||||
return (T) mergedResult;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
|
||||
return (T) merge(left, right, checkDelegateAuthority);
|
||||
}
|
||||
|
@ -106,7 +154,7 @@ public class MergeUtils {
|
|||
return mergeSoftware((Software) left, (Software) right);
|
||||
}
|
||||
|
||||
return mergeResultFields((Result) left, (Result) right);
|
||||
return left;
|
||||
} else if (sameClass(left, right, Datasource.class)) {
|
||||
// TODO
|
||||
final int trust = compareTrust(left, right);
|
||||
|
@ -654,16 +702,9 @@ public class MergeUtils {
|
|||
}
|
||||
|
||||
private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
|
||||
if (d1 == null || StringUtils.isBlank(d1.getValue())) {
|
||||
if (!GraphCleaningFunctions.cleanDateField(d1).isPresent()) {
|
||||
return d2;
|
||||
} else if (d2 == null || StringUtils.isBlank(d2.getValue())) {
|
||||
return d1;
|
||||
}
|
||||
|
||||
if (StringUtils.contains(d1.getValue(), "null")) {
|
||||
return d2;
|
||||
}
|
||||
if (StringUtils.contains(d2.getValue(), "null")) {
|
||||
} else if (!GraphCleaningFunctions.cleanDateField(d2).isPresent()) {
|
||||
return d1;
|
||||
}
|
||||
|
||||
|
@ -715,7 +756,11 @@ public class MergeUtils {
|
|||
private static String spKeyExtractor(StructuredProperty sp) {
|
||||
return Optional
|
||||
.ofNullable(sp)
|
||||
.map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
|
||||
.map(
|
||||
s -> Joiner
|
||||
.on("||")
|
||||
.useForNull("")
|
||||
.join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
|
@ -972,7 +1017,7 @@ public class MergeUtils {
|
|||
private static String extractKeyFromPid(final StructuredProperty pid) {
|
||||
if (pid == null)
|
||||
return null;
|
||||
final StructuredProperty normalizedPid = CleaningFunctions.normalizePidValue(pid);
|
||||
final StructuredProperty normalizedPid = PidCleaner.normalizePidValue(pid);
|
||||
|
||||
return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
|
||||
}
|
||||
|
|
|
@ -1,6 +1,12 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
|
||||
public class ModelHardLimits {
|
||||
|
||||
private ModelHardLimits() {
|
||||
|
@ -12,6 +18,7 @@ public class ModelHardLimits {
|
|||
|
||||
public static final int MAX_EXTERNAL_ENTITIES = 50;
|
||||
public static final int MAX_AUTHORS = 200;
|
||||
public static final int MAX_RELATED_AUTHORS = 20;
|
||||
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
|
||||
public static final int MAX_TITLE_LENGTH = 5000;
|
||||
public static final int MAX_TITLES = 10;
|
||||
|
@ -19,6 +26,12 @@ public class ModelHardLimits {
|
|||
public static final int MAX_ABSTRACT_LENGTH = 150000;
|
||||
public static final int MAX_RELATED_ABSTRACT_LENGTH = 500;
|
||||
public static final int MAX_INSTANCES = 10;
|
||||
public static final Map<String, Long> MAX_RELATIONS_BY_RELCLASS = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.PERSON_PERSON_HASCOAUTHORED, 500L);
|
||||
MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.RESULT_PERSON_HASAUTHORED, 500L);
|
||||
}
|
||||
|
||||
public static String getCollectionName(String format) {
|
||||
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
|
||||
|
|
|
@ -26,7 +26,7 @@ public class PidCleaner {
|
|||
String value = Optional
|
||||
.ofNullable(pidValue)
|
||||
.map(String::trim)
|
||||
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
||||
.orElseThrow(() -> new IllegalArgumentException("PID (" + pidType + ") value cannot be empty"));
|
||||
|
||||
switch (pidType) {
|
||||
|
||||
|
|
|
@ -18,8 +18,8 @@ public class PidValueComparator implements Comparator<StructuredProperty> {
|
|||
if (right == null)
|
||||
return -1;
|
||||
|
||||
StructuredProperty l = CleaningFunctions.normalizePidValue(left);
|
||||
StructuredProperty r = CleaningFunctions.normalizePidValue(right);
|
||||
StructuredProperty l = PidCleaner.normalizePidValue(left);
|
||||
StructuredProperty r = PidCleaner.normalizePidValue(right);
|
||||
|
||||
return Optional
|
||||
.ofNullable(l.getValue())
|
||||
|
|
|
@ -28,6 +28,7 @@ import com.jayway.jsonpath.JsonPath;
|
|||
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
import net.minidev.json.JSONArray;
|
||||
import scala.collection.JavaConverters;
|
||||
import scala.collection.Seq;
|
||||
|
@ -104,7 +105,7 @@ public class DHPUtils {
|
|||
|
||||
public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
|
||||
|
||||
final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
|
||||
final String cleanedPid = PidCleaner.normalizePidValue(pidType, pid);
|
||||
|
||||
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
|
||||
}
|
||||
|
|
|
@ -29,7 +29,7 @@ class IdentifierFactoryTest {
|
|||
"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
|
||||
"publication_doi3.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true);
|
||||
|
@ -41,7 +41,7 @@ class IdentifierFactoryTest {
|
|||
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
|
||||
"publication_pmc2.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
||||
|
|
|
@ -179,7 +179,7 @@ class OafMapperUtilsTest {
|
|||
assertEquals(
|
||||
ModelConstants.DATASET_RESULTTYPE_CLASSID,
|
||||
((Result) MergeUtils
|
||||
.merge(p2, d1))
|
||||
.merge(p2, d1, true))
|
||||
.getResulttype()
|
||||
.getClassid());
|
||||
}
|
||||
|
|
|
@ -29,7 +29,7 @@
|
|||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "21459329"
|
||||
"value": "PMC21459329"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
},
|
||||
{
|
||||
"qualifier":{"classid":"pmc"},
|
||||
"value":"21459329"
|
||||
"value":"PMC21459329"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
|
@ -38,7 +38,7 @@ public class NumAuthorsTitleSuffixPrefixChain extends AbstractClusteringFunction
|
|||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefixChain(cleanup(s), param("mod"));
|
||||
return suffixPrefixChain(cleanup(s), paramOrDefault("mod", 10));
|
||||
}
|
||||
|
||||
private Collection<String> suffixPrefixChain(String s, int mod) {
|
||||
|
|
|
@ -90,7 +90,7 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
|||
inferFrom = normalize(inferFrom);
|
||||
inferFrom = filterAllStopWords(inferFrom);
|
||||
Set<String> cities = getCities(inferFrom, 4);
|
||||
return citiesToCountry(cities).stream().findFirst().orElse("UNKNOWN");
|
||||
return citiesToCountry(cities).stream().filter(Objects::nonNull).findFirst().orElse("UNKNOWN");
|
||||
}
|
||||
|
||||
public static String cityInference(String original) {
|
||||
|
|
|
@ -54,6 +54,22 @@ public class FieldDef implements Serializable {
|
|||
public FieldDef() {
|
||||
}
|
||||
|
||||
public FieldDef clone() {
|
||||
FieldDef fieldDef = new FieldDef();
|
||||
fieldDef.setName(this.name);
|
||||
fieldDef.setPath(this.path);
|
||||
fieldDef.setType(this.type);
|
||||
fieldDef.setOverrideMatch(this.overrideMatch);
|
||||
fieldDef.setSize(this.size);
|
||||
fieldDef.setLength(this.length);
|
||||
fieldDef.setFilter(this.filter);
|
||||
fieldDef.setSorted(this.sorted);
|
||||
fieldDef.setClean(this.clean);
|
||||
fieldDef.setInfer(this.infer);
|
||||
fieldDef.setInferenceFrom(this.inferenceFrom);
|
||||
return fieldDef;
|
||||
}
|
||||
|
||||
public String getInferenceFrom() {
|
||||
return inferenceFrom;
|
||||
}
|
||||
|
|
|
@ -19,48 +19,10 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable {
|
|||
val model: SparkModel = SparkModel(conf)
|
||||
|
||||
val dedup: (Dataset[Row] => Dataset[Row]) = df => {
|
||||
df.transform(filterAndCleanup)
|
||||
.transform(generateClustersWithCollect)
|
||||
df.transform(generateClustersWithCollect)
|
||||
.transform(processBlocks)
|
||||
}
|
||||
|
||||
|
||||
val filterAndCleanup: (Dataset[Row] => Dataset[Row]) = df => {
|
||||
val df_with_filters = conf.getPace.getModel.asScala.foldLeft(df)((res, fdef) => {
|
||||
if (conf.blacklists.containsKey(fdef.getName)) {
|
||||
res.withColumn(
|
||||
fdef.getName + "_filtered",
|
||||
filterColumnUDF(fdef).apply(new Column(fdef.getName))
|
||||
)
|
||||
} else {
|
||||
res
|
||||
}
|
||||
})
|
||||
|
||||
df_with_filters
|
||||
}
|
||||
|
||||
def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
|
||||
val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
|
||||
|
||||
if (blacklist == null) {
|
||||
throw new IllegalArgumentException("Column: " + fdef.getName + " does not have any filter")
|
||||
} else {
|
||||
fdef.getType match {
|
||||
case Type.List | Type.JSON =>
|
||||
udf[Array[String], Array[String]](values => {
|
||||
values.filter((v: String) => !blacklist.test(v))
|
||||
})
|
||||
|
||||
case _ =>
|
||||
udf[String, String](v => {
|
||||
if (blacklist.test(v)) ""
|
||||
else v
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => {
|
||||
var df_with_clustering_keys: Dataset[Row] = null
|
||||
|
||||
|
|
|
@ -5,12 +5,12 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions
|
|||
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
||||
import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
||||
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
|
||||
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||
import org.apache.spark.sql.{Dataset, Row}
|
||||
|
||||
import java.util.Locale
|
||||
import java.util.function.Predicate
|
||||
import java.util.regex.Pattern
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
@ -29,8 +29,20 @@ case class SparkModel(conf: DedupConfig) {
|
|||
identifier.setName(identifierFieldName)
|
||||
identifier.setType(Type.String)
|
||||
|
||||
// create fields for blacklist
|
||||
val filtered = conf.getPace.getModel.asScala.flatMap(fdef => {
|
||||
if (conf.blacklists().containsKey(fdef.getName)) {
|
||||
val fdef_filtered = fdef.clone()
|
||||
fdef_filtered.setName(fdef.getName + "_filtered")
|
||||
Seq(fdef, fdef_filtered)
|
||||
}
|
||||
else {
|
||||
Seq(fdef)
|
||||
}
|
||||
})
|
||||
|
||||
// Construct a Spark StructType representing the schema of the model
|
||||
(Seq(identifier) ++ conf.getPace.getModel.asScala)
|
||||
(Seq(identifier) ++ filtered)
|
||||
.foldLeft(
|
||||
new StructType()
|
||||
)((resType, fieldDef) => {
|
||||
|
@ -44,7 +56,6 @@ case class SparkModel(conf: DedupConfig) {
|
|||
})
|
||||
})
|
||||
|
||||
|
||||
}
|
||||
|
||||
val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)
|
||||
|
@ -52,7 +63,8 @@ case class SparkModel(conf: DedupConfig) {
|
|||
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
|
||||
|
||||
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
|
||||
df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
|
||||
df
|
||||
.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
|
||||
}
|
||||
|
||||
def rowFromJson(json: String): Row = {
|
||||
|
@ -64,41 +76,63 @@ case class SparkModel(conf: DedupConfig) {
|
|||
|
||||
schema.fieldNames.zipWithIndex.foldLeft(values) {
|
||||
case ((res, (fname, index))) =>
|
||||
val fdef = conf.getPace.getModelMap.get(fname)
|
||||
|
||||
val fdef = conf.getPace.getModelMap.get(fname.split("_filtered")(0))
|
||||
|
||||
if (fdef != null) {
|
||||
res(index) = fdef.getType match {
|
||||
case Type.String | Type.Int =>
|
||||
MapDocumentUtil.truncateValue(
|
||||
MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
|
||||
fdef.getLength
|
||||
)
|
||||
if (!fname.contains("_filtered")) { //process fields with no blacklist
|
||||
res(index) = fdef.getType match {
|
||||
case Type.String | Type.Int =>
|
||||
MapDocumentUtil.truncateValue(
|
||||
MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
|
||||
fdef.getLength
|
||||
)
|
||||
|
||||
case Type.URL =>
|
||||
var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
|
||||
if (!URL_REGEX.matcher(uv).matches)
|
||||
uv = ""
|
||||
uv
|
||||
case Type.URL =>
|
||||
var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
|
||||
if (!URL_REGEX.matcher(uv).matches)
|
||||
uv = ""
|
||||
uv
|
||||
|
||||
case Type.List | Type.JSON =>
|
||||
MapDocumentUtil.truncateList(
|
||||
MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
|
||||
fdef.getSize
|
||||
).asScala
|
||||
case Type.List | Type.JSON =>
|
||||
MapDocumentUtil.truncateList(
|
||||
MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
|
||||
fdef.getSize
|
||||
).asScala
|
||||
|
||||
case Type.StringConcat =>
|
||||
val jpaths = CONCAT_REGEX.split(fdef.getPath)
|
||||
case Type.StringConcat =>
|
||||
val jpaths = CONCAT_REGEX.split(fdef.getPath)
|
||||
|
||||
MapDocumentUtil.truncateValue(
|
||||
jpaths
|
||||
.map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
|
||||
.mkString(" "),
|
||||
fdef.getLength
|
||||
)
|
||||
MapDocumentUtil.truncateValue(
|
||||
jpaths
|
||||
.map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
|
||||
.mkString(" "),
|
||||
fdef.getLength
|
||||
)
|
||||
|
||||
case Type.DoubleArray =>
|
||||
MapDocumentUtil.getJPathArray(fdef.getPath, json)
|
||||
case Type.DoubleArray =>
|
||||
MapDocumentUtil.getJPathArray(fdef.getPath, json)
|
||||
}
|
||||
}
|
||||
else { //process fields with blacklist
|
||||
val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
|
||||
|
||||
res(index) = fdef.getType match {
|
||||
case Type.List | Type.JSON =>
|
||||
MapDocumentUtil.truncateList(
|
||||
MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
|
||||
fdef.getSize
|
||||
).asScala.filter((v: String) => !blacklist.test(v))
|
||||
|
||||
case _ =>
|
||||
val value: String = MapDocumentUtil.truncateValue(
|
||||
MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
|
||||
fdef.getLength
|
||||
)
|
||||
if (blacklist.test(value)) "" else value
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
val filter = fdef.getFilter
|
||||
|
||||
|
@ -125,13 +159,12 @@ case class SparkModel(conf: DedupConfig) {
|
|||
}
|
||||
|
||||
if (StringUtils.isNotBlank(fdef.getInfer)) {
|
||||
val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
|
||||
val inferFrom: String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
|
||||
res(index) = res(index) match {
|
||||
case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
|
||||
case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
res
|
||||
|
@ -139,6 +172,7 @@ case class SparkModel(conf: DedupConfig) {
|
|||
}
|
||||
|
||||
new GenericRowWithSchema(values, schema)
|
||||
|
||||
}
|
||||
|
||||
def clean(value: String, cleantype: String) : String = {
|
||||
|
|
|
@ -21,7 +21,7 @@ public class CodeMatch extends AbstractStringComparator {
|
|||
public CodeMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]::\\d+"));
|
||||
this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]+::\\d+"));
|
||||
}
|
||||
|
||||
public Set<String> getRegexList(String input) {
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.time.DateTimeException;
|
||||
import java.time.LocalDate;
|
||||
import java.time.Period;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("dateRange")
|
||||
public class DateRange extends AbstractStringComparator {
|
||||
|
||||
int YEAR_RANGE;
|
||||
|
||||
public DateRange(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3"));
|
||||
}
|
||||
|
||||
public DateRange(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected DateRange(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
public static boolean isNumeric(String str) {
|
||||
return str.matches("\\d+"); // match a number with optional '-' and decimal.
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1.0; // return -1 if a field is missing
|
||||
}
|
||||
|
||||
try {
|
||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH);
|
||||
LocalDate d1 = LocalDate.parse(a, formatter);
|
||||
LocalDate d2 = LocalDate.parse(b, formatter);
|
||||
Period period = Period.between(d1, d2);
|
||||
|
||||
return period.getYears() <= YEAR_RANGE ? 1.0 : 0.0;
|
||||
} catch (DateTimeException e) {
|
||||
return -1.0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
}
|
|
@ -41,21 +41,38 @@ public class JsonListMatch extends AbstractListComparator {
|
|||
return -1;
|
||||
}
|
||||
|
||||
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||
switch (MODE) {
|
||||
case "count":
|
||||
return Sets.intersection(ca, cb).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
case "percentage":
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
return (double) incommon / (incommon + simDiff);
|
||||
|
||||
case "type":
|
||||
Set<String> typesA = ca.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
|
||||
Set<String> typesB = cb.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
|
||||
|
||||
Set<String> types = Sets.intersection(typesA, typesB);
|
||||
|
||||
if (types.isEmpty()) // if no common type, it is impossible to compare
|
||||
return -1;
|
||||
|
||||
ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
|
||||
cb = cb.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
|
||||
|
||||
return (double) Sets.intersection(ca, cb).size() / types.size();
|
||||
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (MODE.equals("percentage"))
|
||||
return (double) incommon / (incommon + simDiff);
|
||||
else
|
||||
return incommon;
|
||||
|
||||
}
|
||||
|
||||
// converts every json into a comparable string basing on parameters
|
||||
|
@ -69,7 +86,7 @@ public class JsonListMatch extends AbstractListComparator {
|
|||
// for each path in the param list
|
||||
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
||||
String path = params.get(key);
|
||||
String value = MapDocumentUtil.getJPathString(path, documentContext);
|
||||
String value = MapDocumentUtil.getJPathString(path, documentContext).toLowerCase();
|
||||
if (value == null || value.isEmpty())
|
||||
value = "";
|
||||
st.append(value);
|
||||
|
|
|
@ -48,7 +48,7 @@ public class TreeNodeDef implements Serializable {
|
|||
// function for the evaluation of the node
|
||||
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
||||
|
||||
TreeNodeStats stats = new TreeNodeStats(ignoreUndefined);
|
||||
TreeNodeStats stats = new TreeNodeStats();
|
||||
|
||||
// for each field in the node, it computes the
|
||||
for (FieldConf fieldConf : fields) {
|
||||
|
|
|
@ -9,11 +9,8 @@ public class TreeNodeStats implements Serializable {
|
|||
|
||||
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
|
||||
|
||||
private final boolean ignoreUndefined;
|
||||
|
||||
public TreeNodeStats(boolean ignoreUndefined) {
|
||||
public TreeNodeStats() {
|
||||
this.results = new HashMap<>();
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
public Map<String, FieldStats> getResults() {
|
||||
|
@ -25,10 +22,7 @@ public class TreeNodeStats implements Serializable {
|
|||
}
|
||||
|
||||
public int fieldsCount() {
|
||||
if (ignoreUndefined)
|
||||
return this.results.size();
|
||||
else
|
||||
return this.results.size() - undefinedCount(); // do not count undefined
|
||||
return this.results.size();
|
||||
}
|
||||
|
||||
public int undefinedCount() {
|
||||
|
@ -84,22 +78,11 @@ public class TreeNodeStats implements Serializable {
|
|||
double min = 100.0; // random high value
|
||||
for (FieldStats fs : this.results.values()) {
|
||||
if (fs.getResult() < min) {
|
||||
if (fs.getResult() == -1) {
|
||||
if (fs.isCountIfUndefined()) {
|
||||
min = 0.0;
|
||||
} else {
|
||||
min = -1;
|
||||
}
|
||||
} else {
|
||||
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
|
||||
min = fs.getResult();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ignoreUndefined) {
|
||||
return min == -1.0 ? 0.0 : min;
|
||||
} else {
|
||||
return min;
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
// if at least one is true, return 1.0
|
||||
|
@ -108,11 +91,7 @@ public class TreeNodeStats implements Serializable {
|
|||
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
||||
return 1.0;
|
||||
}
|
||||
if (!ignoreUndefined && undefinedCount() > 0) {
|
||||
return -1.0;
|
||||
} else {
|
||||
return 0.0;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
// if at least one is false, return 0.0
|
||||
|
@ -121,7 +100,7 @@ public class TreeNodeStats implements Serializable {
|
|||
|
||||
if (fieldStats.getResult() == -1) {
|
||||
if (fieldStats.isCountIfUndefined())
|
||||
return ignoreUndefined ? 0.0 : -1.0;
|
||||
return 0.0;
|
||||
} else {
|
||||
if (fieldStats.getResult() < fieldStats.getThreshold())
|
||||
return 0.0;
|
||||
|
|
|
@ -44,10 +44,12 @@ public class TreeProcessor {
|
|||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||
treeStats.addNodeStats(nextNodeName, stats);
|
||||
|
||||
double finalScore = stats.getFinalScore(currentNode.getAggregation());
|
||||
if (finalScore == -1.0)
|
||||
// if ignoreUndefined=false the miss is considered as undefined
|
||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
||||
nextNodeName = currentNode.getUndefined();
|
||||
else if (finalScore >= currentNode.getThreshold()) {
|
||||
}
|
||||
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||
nextNodeName = currentNode.getPositive();
|
||||
} else {
|
||||
nextNodeName = currentNode.getNegative();
|
||||
|
|
|
@ -227,4 +227,17 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNumAuthorsTitleSuffixPrefixChain() {
|
||||
|
||||
final ClusteringFunction cf = new NumAuthorsTitleSuffixPrefixChain(params);
|
||||
params.put("mod", 10);
|
||||
|
||||
final String title = "PARP-2 Regulates SIRT1 Expression and Whole-Body Energy Expenditure";
|
||||
final String num_authors = "10";
|
||||
System.out.println("title = " + title);
|
||||
System.out.println("num_authors = " + num_authors);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(num_authors, title)));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
|
||||
package eu.dnetlib.pace.common;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
|
@ -54,8 +53,17 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
|
|||
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test()
|
||||
public void countryInferenceTest_NPE() {
|
||||
assertThrows(
|
||||
NullPointerException.class,
|
||||
() -> countryInference("UNKNOWN", null),
|
||||
"Expected countryInference() to throw an NPE");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void countryInferenceTest() {
|
||||
assertEquals("UNKNOWN", countryInference("UNKNOWN", ""));
|
||||
assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
|
||||
assertEquals("UK", countryInference("UK", "Università di Bologna"));
|
||||
assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
|
||||
|
|
|
@ -65,6 +65,43 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void datasetVersionCodeMatchTest() {
|
||||
|
||||
params.put("codeRegex", "(?=[\\w-]*[a-zA-Z])(?=[\\w-]*\\d)[\\w-]+");
|
||||
CodeMatch codeMatch = new CodeMatch(params);
|
||||
|
||||
// names have different codes
|
||||
assertEquals(
|
||||
0.0,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998 ev02a",
|
||||
"physical oceanography at ctd station june 1998 ir02", conf));
|
||||
|
||||
// names have same code
|
||||
assertEquals(
|
||||
1.0,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998 ev02a",
|
||||
"physical oceanography at ctd station june 1998 ev02a", conf));
|
||||
|
||||
// code is not in both names
|
||||
assertEquals(
|
||||
-1,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998",
|
||||
"physical oceanography at ctd station june 1998 ev02a", conf));
|
||||
assertEquals(
|
||||
1.0,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998",
|
||||
conf));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void listContainsMatchTest() {
|
||||
|
||||
|
@ -257,15 +294,15 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
List<String> a = createFieldList(
|
||||
Arrays
|
||||
.asList(
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"),
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"),
|
||||
"authors");
|
||||
List<String> b = createFieldList(
|
||||
Arrays
|
||||
.asList(
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"),
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_2\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"isni\",\"classname\":\"ISNI Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"isni_1\"}"),
|
||||
"authors");
|
||||
|
||||
double result = jsonListMatch.compare(a, b, conf);
|
||||
|
@ -277,6 +314,13 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
result = jsonListMatch.compare(a, b, conf);
|
||||
|
||||
assertEquals(1.0, result);
|
||||
|
||||
params.put("mode", "type");
|
||||
jsonListMatch = new JsonListMatch(params);
|
||||
result = jsonListMatch.compare(a, b, conf);
|
||||
|
||||
assertEquals(0.5, result);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -327,4 +371,34 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void dateMatch() {
|
||||
|
||||
DateRange dateRange = new DateRange(params);
|
||||
|
||||
double result = dateRange.distance("2021-05-13", "2023-05-13", conf);
|
||||
assertEquals(1.0, result);
|
||||
|
||||
result = dateRange.distance("2021-05-13", "2025-05-13", conf);
|
||||
assertEquals(0.0, result);
|
||||
|
||||
result = dateRange.distance("", "2020-05-05", conf);
|
||||
assertEquals(-1.0, result);
|
||||
|
||||
result = dateRange.distance("invalid date", "2021-05-02", conf);
|
||||
assertEquals(-1.0, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void titleVersionMatchTest() {
|
||||
|
||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
||||
|
||||
double result = titleVersionMatch
|
||||
.compare(
|
||||
"parp 2 regulates sirt 1 expression and whole body energy expenditure",
|
||||
"parp 2 regulates sirt 1 expression and whole body energy expenditure", conf);
|
||||
assertEquals(1.0, result);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -11,7 +11,6 @@ import org.junit.jupiter.api.Disabled;
|
|||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||
|
||||
public class UtilTest {
|
||||
|
||||
|
|
|
@ -26,16 +26,16 @@
|
|||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-actionmanager</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-aggregation</artifactId>-->
|
||||
<!-- <artifactId>dhp-actionmanager</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-blacklist</artifactId>-->
|
||||
|
@ -56,61 +56,61 @@
|
|||
<!-- <artifactId>dhp-enrichment</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-graph-mapper</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-graph-provision</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-impact-indicators</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-actionsets</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-hist-snaps</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-monitor-irish</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-promote</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-update</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-swh</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-usage-raw-data-update</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-usage-stats-build</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-graph-mapper</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-graph-provision</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-impact-indicators</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-actionsets</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-hist-snaps</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-monitor-irish</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-promote</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-update</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-swh</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-usage-raw-data-update</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-usage-stats-build</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
</dependencies>
|
||||
|
||||
|
||||
|
|
|
@ -151,12 +151,17 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
SparkSession spark, String path, Class<G> rowClazz) {
|
||||
logger.info("Reading graph table from path: {}", path);
|
||||
|
||||
return spark
|
||||
.read()
|
||||
.textFile(path)
|
||||
.map(
|
||||
(MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
|
||||
Encoders.bean(rowClazz));
|
||||
if (HdfsSupport.exists(path, spark.sparkContext().hadoopConfiguration())) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(path)
|
||||
.map(
|
||||
(MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
|
||||
Encoders.bean(rowClazz));
|
||||
} else {
|
||||
logger.info("Found empty graph table from path: {}", path);
|
||||
return spark.emptyDataset(Encoders.bean(rowClazz));
|
||||
}
|
||||
}
|
||||
|
||||
private static <A extends Oaf> Dataset<A> readActionPayload(
|
||||
|
@ -223,7 +228,7 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
rowClazz,
|
||||
actionPayloadClazz);
|
||||
|
||||
if (shouldGroupById) {
|
||||
if (Boolean.TRUE.equals(shouldGroupById)) {
|
||||
return PromoteActionPayloadFunctions
|
||||
.groupGraphTableByIdAndMerge(
|
||||
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
|
||||
|
@ -250,6 +255,8 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Relation());
|
||||
case "eu.dnetlib.dhp.schema.oaf.Software":
|
||||
return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Software());
|
||||
case "eu.dnetlib.dhp.schema.oaf.Person":
|
||||
return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Person());
|
||||
default:
|
||||
throw new RuntimeException("unknown class: " + clazz.getCanonicalName());
|
||||
}
|
||||
|
|
|
@ -50,7 +50,7 @@ public class PromoteActionPayloadFunctions {
|
|||
PromoteAction.Strategy promoteActionStrategy,
|
||||
Class<G> rowClazz,
|
||||
Class<A> actionPayloadClazz) {
|
||||
if (!isSubClass(rowClazz, actionPayloadClazz)) {
|
||||
if (Boolean.FALSE.equals(isSubClass(rowClazz, actionPayloadClazz))) {
|
||||
throw new RuntimeException(
|
||||
"action payload type must be the same or be a super type of table row type");
|
||||
}
|
||||
|
|
|
@ -7,3 +7,4 @@ promote_action_payload_for_project_table classpath eu/dnetlib/dhp/actionmanager/
|
|||
promote_action_payload_for_publication_table classpath eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app
|
||||
promote_action_payload_for_relation_table classpath eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app
|
||||
promote_action_payload_for_software_table classpath eu/dnetlib/dhp/actionmanager/wf/software/oozie_app
|
||||
promote_action_payload_for_person_table classpath eu/dnetlib/dhp/actionmanager/wf/person/oozie_app
|
||||
|
|
|
@ -135,21 +135,10 @@
|
|||
<arg>--outputPath</arg><arg>${workingDir}/action_payload_by_type</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="ForkPromote"/>
|
||||
<ok to="PromoteActionPayloadForDatasetTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="ForkPromote">
|
||||
<path start="PromoteActionPayloadForDatasetTable"/>
|
||||
<path start="PromoteActionPayloadForDatasourceTable"/>
|
||||
<path start="PromoteActionPayloadForOrganizationTable"/>
|
||||
<path start="PromoteActionPayloadForOtherResearchProductTable"/>
|
||||
<path start="PromoteActionPayloadForProjectTable"/>
|
||||
<path start="PromoteActionPayloadForPublicationTable"/>
|
||||
<path start="PromoteActionPayloadForRelationTable"/>
|
||||
<path start="PromoteActionPayloadForSoftwareTable"/>
|
||||
</fork>
|
||||
|
||||
<action name="PromoteActionPayloadForDatasetTable">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/promote_action_payload_for_dataset_table</app-path>
|
||||
|
@ -161,7 +150,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForDatasourceTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -176,7 +165,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForOrganizationTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -191,7 +180,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForOtherResearchProductTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -206,7 +195,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForProjectTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -221,7 +210,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForPublicationTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -236,7 +225,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForRelationTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -251,7 +240,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForSoftwareTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -266,11 +255,9 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="JoinPromote" to="End"/>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,129 @@
|
|||
<workflow-app name="promote_action_payload_for_person_table" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>activePromotePersonActionPayload</name>
|
||||
<description>when true will promote actions with eu.dnetlib.dhp.schema.oaf.Person payload</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>inputGraphRootPath</name>
|
||||
<description>root location of input materialized graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>inputActionPayloadRootPath</name>
|
||||
<description>root location of action payloads to promote</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputGraphRootPath</name>
|
||||
<description>root location for output materialized graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mergeAndGetStrategy</name>
|
||||
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="DecisionPromotePersonActionPayload"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<decision name="DecisionPromotePersonActionPayload">
|
||||
<switch>
|
||||
<case to="PromotePersonActionPayloadForPersonTable">
|
||||
${(activePromotePersonActionPayload eq "true") and
|
||||
(fs:exists(concat(concat(concat(concat(wf:conf('nameNode'),'/'),wf:conf('inputActionPayloadRootPath')),'/'),'clazz=eu.dnetlib.dhp.schema.oaf.Person')) eq "true")}
|
||||
</case>
|
||||
<default to="SkipPromotePersonActionPayloadForPersonTable"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="PromotePersonActionPayloadForPersonTable">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PromotePersonActionPayloadForPersonTable</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob</class>
|
||||
<jar>dhp-actionmanager-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/person</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
|
||||
<arg>--inputActionPayloadPath</arg><arg>${inputActionPayloadRootPath}/clazz=eu.dnetlib.dhp.schema.oaf.Person</arg>
|
||||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/person</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="SkipPromotePersonActionPayloadForPersonTable">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<prepare>
|
||||
<delete path="${outputGraphRootPath}/person"/>
|
||||
</prepare>
|
||||
<arg>-pb</arg>
|
||||
<arg>${inputGraphRootPath}/person</arg>
|
||||
<arg>${outputGraphRootPath}/person</arg>
|
||||
</distcp>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -110,6 +110,11 @@
|
|||
<artifactId>commons-compress</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-csv</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
|
|
|
@ -13,6 +13,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
|
|
@ -10,7 +10,6 @@ import java.util.List;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.BZip2Codec;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
|
@ -29,12 +28,13 @@ import eu.dnetlib.dhp.schema.action.AtomicAction;
|
|||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.DoiCleaningRule;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Creates action sets for Crossref affiliation relations inferred by BIP!
|
||||
* Creates action sets for Crossref affiliation relations inferred by OpenAIRE
|
||||
*/
|
||||
public class PrepareAffiliationRelations implements Serializable {
|
||||
|
||||
|
@ -44,6 +44,10 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
|
||||
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
|
||||
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
|
||||
public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
|
||||
public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
|
||||
public static final String DOI_URL_PREFIX = "https://doi.org/";
|
||||
public static final int DOI_URL_PREFIX_LENGTH = 16;
|
||||
|
||||
public static <I extends Result> void main(String[] args) throws Exception {
|
||||
|
||||
|
@ -74,6 +78,9 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
final String webcrawlInputPath = parser.get("webCrawlInputPath");
|
||||
log.info("webcrawlInputPath: {}", webcrawlInputPath);
|
||||
|
||||
final String publisherInputPath = parser.get("publisherInputPath");
|
||||
log.info("publisherInputPath: {}", publisherInputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
|
@ -84,46 +91,80 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Constants.removeOutputDir(spark, outputPath);
|
||||
|
||||
List<KeyValue> collectedFromCrossref = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
|
||||
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
|
||||
spark, crossrefInputPath, collectedFromCrossref);
|
||||
|
||||
List<KeyValue> collectedFromPubmed = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
|
||||
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
||||
spark, pubmedInputPath, collectedFromPubmed);
|
||||
|
||||
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
|
||||
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
|
||||
spark, openapcInputPath, collectedFromOpenAPC);
|
||||
|
||||
List<KeyValue> collectedFromDatacite = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
|
||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||
spark, dataciteInputPath, collectedFromDatacite);
|
||||
|
||||
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
||||
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
|
||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||
spark, webcrawlInputPath, collectedFromWebCrawl);
|
||||
|
||||
crossrefRelations
|
||||
.union(pubmedRelations)
|
||||
.union(openAPCRelations)
|
||||
.union(dataciteRelations)
|
||||
.union(webCrawlRelations)
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
|
||||
createActionSet(
|
||||
spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath,
|
||||
publisherInputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath,
|
||||
String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath,
|
||||
String outputPath) {
|
||||
List<KeyValue> collectedfromOpenAIRE = OafMapperUtils
|
||||
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||
|
||||
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
|
||||
spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":crossref");
|
||||
|
||||
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
||||
spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":pubmed");
|
||||
|
||||
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
|
||||
spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":openapc");
|
||||
|
||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelationsNewModel(
|
||||
spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":datacite");
|
||||
|
||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelationsNewModel(
|
||||
spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":rawaff");
|
||||
|
||||
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisherNewModel(
|
||||
spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":webcrawl");
|
||||
|
||||
crossrefRelations
|
||||
.union(pubmedRelations)
|
||||
.union(openAPCRelations)
|
||||
.union(dataciteRelations)
|
||||
.union(webCrawlRelations)
|
||||
.union(publisherRelations)
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
|
||||
String inputPath,
|
||||
List<KeyValue> collectedfrom,
|
||||
String dataprovenance) {
|
||||
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
.schema(
|
||||
"`DOI` STRING, `Organizations` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
|
||||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDDNew(
|
||||
collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance);
|
||||
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
||||
List<KeyValue> collectedfrom, String dataprovenance) {
|
||||
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
||||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDD(
|
||||
collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance);
|
||||
|
||||
}
|
||||
|
||||
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
|
||||
String inputPath,
|
||||
List<KeyValue> collectedfrom) {
|
||||
List<KeyValue> collectedfrom, String dataprovenance) {
|
||||
|
||||
// load and parse affiliation relations from HDFS
|
||||
Dataset<Row> df = spark
|
||||
|
@ -132,6 +173,25 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDD(collectedfrom, df, dataprovenance);
|
||||
}
|
||||
|
||||
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
|
||||
String inputPath,
|
||||
List<KeyValue> collectedfrom, String dataprovenance) {
|
||||
// load and parse affiliation relations from HDFS
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
.schema(
|
||||
"`DOI` STRING, `Matchings` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
|
||||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDDNew(collectedfrom, df, dataprovenance);
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df,
|
||||
String dataprovenance) {
|
||||
// unroll nested arrays
|
||||
df = df
|
||||
.withColumn("matching", functions.explode(new Column("Matchings")))
|
||||
|
@ -147,7 +207,7 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
|
||||
// DOI to OpenAIRE id
|
||||
final String paperId = ID_PREFIX
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
|
||||
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));
|
||||
|
||||
// ROR id to OpenAIRE id
|
||||
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
|
||||
|
@ -163,7 +223,7 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
DataInfo dataInfo = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
BIP_INFERENCE_PROVENANCE,
|
||||
dataprovenance,
|
||||
true,
|
||||
false,
|
||||
qualifier,
|
||||
|
@ -179,6 +239,70 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df,
|
||||
String dataprovenance) {
|
||||
// unroll nested arrays
|
||||
df = df
|
||||
.withColumn("matching", functions.explode(new Column("Matchings")))
|
||||
.select(
|
||||
new Column("DOI").as("doi"),
|
||||
new Column("matching.PID").as("pidtype"),
|
||||
new Column("matching.Value").as("pidvalue"),
|
||||
new Column("matching.Confidence").as("confidence"),
|
||||
new Column("matching.Status").as("status"))
|
||||
.where("status = 'active'");
|
||||
|
||||
// prepare action sets for affiliation relations
|
||||
return df
|
||||
.toJavaRDD()
|
||||
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
||||
|
||||
// DOI to OpenAIRE id
|
||||
final String paperId = ID_PREFIX
|
||||
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));
|
||||
|
||||
// Organization to OpenAIRE identifier
|
||||
String affId = null;
|
||||
if (row.getAs("pidtype").equals("ROR"))
|
||||
// ROR id to OpenIARE id
|
||||
affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("pidvalue"));
|
||||
else
|
||||
// getting the OpenOrgs identifier for the organization
|
||||
affId = row.getAs("pidvalue");
|
||||
|
||||
Qualifier qualifier = OafMapperUtils
|
||||
.qualifier(
|
||||
BIP_AFFILIATIONS_CLASSID,
|
||||
BIP_AFFILIATIONS_CLASSNAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS);
|
||||
|
||||
// format data info; setting `confidence` into relation's `trust`
|
||||
DataInfo dataInfo = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
dataprovenance,
|
||||
true,
|
||||
false,
|
||||
qualifier,
|
||||
Double.toString(row.getAs("confidence")));
|
||||
|
||||
// return bi-directional relations
|
||||
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
|
||||
|
||||
})
|
||||
.map(p -> new AtomicAction(Relation.class, p))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||
}
|
||||
|
||||
private static String removePrefix(String doi) {
|
||||
if (doi.startsWith(DOI_URL_PREFIX))
|
||||
return doi.substring(DOI_URL_PREFIX_LENGTH);
|
||||
return doi;
|
||||
}
|
||||
|
||||
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
|
||||
DataInfo dataInfo) {
|
||||
return Arrays
|
||||
|
|
|
@ -46,6 +46,9 @@ public class GetOpenCitationsRefs implements Serializable {
|
|||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final String backupPath = parser.get("backupPath");
|
||||
log.info("backupPath {}", backupPath);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
|
@ -53,11 +56,11 @@ public class GetOpenCitationsRefs implements Serializable {
|
|||
|
||||
GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
|
||||
|
||||
ocr.doExtract(inputPath, outputPath, fileSystem);
|
||||
ocr.doExtract(inputPath, outputPath, backupPath, fileSystem);
|
||||
|
||||
}
|
||||
|
||||
private void doExtract(String inputPath, String outputPath, FileSystem fileSystem)
|
||||
private void doExtract(String inputPath, String outputPath, String backupPath, FileSystem fileSystem)
|
||||
throws IOException {
|
||||
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
|
@ -89,6 +92,7 @@ public class GetOpenCitationsRefs implements Serializable {
|
|||
}
|
||||
|
||||
}
|
||||
fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -107,7 +107,8 @@ public class ReadCOCI implements Serializable {
|
|||
.mode(SaveMode.Append)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
fileSystem.rename(fileStatus.getPath(), new Path("/tmp/miriam/OC/DONE"));
|
||||
|
||||
fileSystem.delete(fileStatus.getPath());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,25 +2,34 @@
|
|||
package eu.dnetlib.dhp.actionmanager.personentity;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static org.apache.spark.sql.functions.*;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.BZip2Codec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.*;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.spark_project.jetty.util.StringUtil;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
|
@ -28,13 +37,14 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|||
import eu.dnetlib.dhp.collection.orcid.model.Author;
|
||||
import eu.dnetlib.dhp.collection.orcid.model.Employment;
|
||||
import eu.dnetlib.dhp.collection.orcid.model.Work;
|
||||
import eu.dnetlib.dhp.common.DbClient;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.common.person.CoAuthorshipIterator;
|
||||
import eu.dnetlib.dhp.common.person.Coauthors;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Person;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
|
@ -44,7 +54,7 @@ import scala.Tuple2;
|
|||
|
||||
public class ExtractPerson implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class);
|
||||
|
||||
private static final String QUERY = "SELECT * FROM project_person WHERE pid_type = 'ORCID'";
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private static final String OPENAIRE_PREFIX = "openaire____";
|
||||
private static final String SEPARATOR = "::";
|
||||
|
@ -58,9 +68,48 @@ public class ExtractPerson implements Serializable {
|
|||
|
||||
private static final String PMCID_PREFIX = "50|pmcid_______::";
|
||||
private static final String ROR_PREFIX = "20|ror_________::";
|
||||
private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
|
||||
private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class)
|
||||
+ IdentifierFactory.ID_PREFIX_SEPARATOR + ModelConstants.ORCID + "_______";
|
||||
private static final String PROJECT_ID_PREFIX = ModelSupport.getIdPrefix(Project.class)
|
||||
+ IdentifierFactory.ID_PREFIX_SEPARATOR;
|
||||
|
||||
public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
|
||||
public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
|
||||
public static final String FUNDER_AUTHORS_CLASSID = "sysimport:crosswalk:funderdatabase";
|
||||
public static final String FUNDER_AUTHORS_CLASSNAME = "Imported from Funder Database";
|
||||
public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
|
||||
public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
|
||||
|
||||
public static List<KeyValue> collectedfromOpenAIRE = OafMapperUtils
|
||||
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||
|
||||
public static final DataInfo ORCIDDATAINFO = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ORCID_AUTHORS_CLASSID,
|
||||
ORCID_AUTHORS_CLASSNAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.91");
|
||||
|
||||
public static final DataInfo FUNDERDATAINFO = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
FUNDER_AUTHORS_CLASSID,
|
||||
FUNDER_AUTHORS_CLASSNAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.91");
|
||||
|
||||
public static void main(final String[] args) throws IOException, ParseException {
|
||||
|
||||
|
@ -91,19 +140,130 @@ public class ExtractPerson implements Serializable {
|
|||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir {}", workingDir);
|
||||
|
||||
final String dbUrl = parser.get("postgresUrl");
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
|
||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
|
||||
createActionSet(spark, inputPath, outputPath, workingDir);
|
||||
extractInfoForActionSetFromORCID(spark, inputPath, workingDir);
|
||||
extractInfoForActionSetFromProjects(
|
||||
spark, inputPath, workingDir, dbUrl, dbUser, dbPassword, workingDir + "/project", hdfsNameNode);
|
||||
createActionSet(spark, outputPath, workingDir);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void createActionSet(SparkSession spark, String inputPath, String outputPath, String workingDir) {
|
||||
private static void extractInfoForActionSetFromProjects(SparkSession spark, String inputPath, String workingDir,
|
||||
String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode) throws IOException {
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
Path hdfsWritePath = new Path(hdfsPath);
|
||||
FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
|
||||
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
||||
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
|
||||
dbClient.processResults(QUERY, rs -> writeRelation(getRelationWithProject(rs), writer));
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static Relation getRelationWithProject(ResultSet rs) {
|
||||
try {
|
||||
return getProjectRelation(
|
||||
rs.getString("project"), rs.getString("pid"),
|
||||
rs.getString("role"));
|
||||
} catch (final SQLException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static Relation getProjectRelation(String project, String orcid, String role) {
|
||||
|
||||
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
|
||||
String target = PROJECT_ID_PREFIX + StringUtils.substringBefore(project, "::") + "::"
|
||||
+ IdentifierFactory.md5(StringUtils.substringAfter(project, "::"));
|
||||
List<KeyValue> properties = new ArrayList<>();
|
||||
|
||||
Relation relation = OafMapperUtils
|
||||
.getRelation(
|
||||
source, target, ModelConstants.PROJECT_PERSON_RELTYPE, ModelConstants.PROJECT_PERSON_SUBRELTYPE,
|
||||
ModelConstants.PROJECT_PERSON_PARTICIPATES,
|
||||
collectedfromOpenAIRE,
|
||||
FUNDERDATAINFO,
|
||||
null);
|
||||
relation.setValidated(true);
|
||||
|
||||
if (StringUtils.isNotBlank(role)) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey("role");
|
||||
kv.setValue(role);
|
||||
properties.add(kv);
|
||||
}
|
||||
|
||||
if (!properties.isEmpty())
|
||||
relation.setProperties(properties);
|
||||
return relation;
|
||||
|
||||
}
|
||||
|
||||
protected static void writeRelation(final Relation relation, BufferedWriter writer) {
|
||||
try {
|
||||
writer.write(OBJECT_MAPPER.writeValueAsString(relation));
|
||||
writer.newLine();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static void createActionSet(SparkSession spark, String outputPath, String workingDir) {
|
||||
|
||||
Dataset<Person> people;
|
||||
people = spark
|
||||
.read()
|
||||
.textFile(workingDir + "/people")
|
||||
.map(
|
||||
(MapFunction<String, Person>) value -> OBJECT_MAPPER
|
||||
.readValue(value, Person.class),
|
||||
Encoders.bean(Person.class));
|
||||
|
||||
people
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(p.getClass(), p))
|
||||
.union(
|
||||
getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
|
||||
.union(
|
||||
getRelations(spark, workingDir + "/coauthorship")
|
||||
.toJavaRDD()
|
||||
.map(r -> new AtomicAction(r.getClass(), r)))
|
||||
.union(
|
||||
getRelations(spark, workingDir + "/affiliation")
|
||||
.toJavaRDD()
|
||||
.map(r -> new AtomicAction(r.getClass(), r)))
|
||||
.union(
|
||||
getRelations(spark, workingDir + "/project")
|
||||
.toJavaRDD()
|
||||
.map(r -> new AtomicAction(r.getClass(), r)))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
}
|
||||
|
||||
private static void extractInfoForActionSetFromORCID(SparkSession spark, String inputPath, String workingDir) {
|
||||
Dataset<Author> authors = spark
|
||||
.read()
|
||||
.parquet(inputPath + "Authors")
|
||||
|
@ -129,18 +289,13 @@ public class ExtractPerson implements Serializable {
|
|||
.parquet(inputPath + "Employments")
|
||||
.as(Encoders.bean(Employment.class));
|
||||
|
||||
Dataset<Author> peopleToMap = authors
|
||||
.joinWith(works, authors.col("orcid").equalTo(works.col("orcid")))
|
||||
.map((MapFunction<Tuple2<Author, Work>, Author>) t2 -> t2._1(), Encoders.bean(Author.class))
|
||||
.groupByKey((MapFunction<Author, String>) a -> a.getOrcid(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Author, Author>) (k, it) -> it.next(), Encoders.bean(Author.class));
|
||||
|
||||
Dataset<Employment> employment = employmentDataset
|
||||
.joinWith(peopleToMap, employmentDataset.col("orcid").equalTo(peopleToMap.col("orcid")))
|
||||
.joinWith(authors, employmentDataset.col("orcid").equalTo(authors.col("orcid")))
|
||||
.map((MapFunction<Tuple2<Employment, Author>, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class));
|
||||
|
||||
Dataset<Person> people;
|
||||
peopleToMap.map((MapFunction<Author, Person>) op -> {
|
||||
// Mapping all the orcid profiles even if the profile has no visible works
|
||||
|
||||
authors.map((MapFunction<Author, Person>) op -> {
|
||||
Person person = new Person();
|
||||
person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX));
|
||||
person
|
||||
|
@ -190,9 +345,23 @@ public class ExtractPerson implements Serializable {
|
|||
OafMapperUtils
|
||||
.structuredProperty(
|
||||
op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
|
||||
ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
|
||||
ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES,
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
|
||||
ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES),
|
||||
"0.91")));
|
||||
person.setDateofcollection(op.getLastModifiedDate());
|
||||
person.setOriginalId(Arrays.asList(op.getOrcid()));
|
||||
person.setDataInfo(ORCIDDATAINFO);
|
||||
return person;
|
||||
}, Encoders.bean(Person.class))
|
||||
.write()
|
||||
|
@ -246,34 +415,6 @@ public class ExtractPerson implements Serializable {
|
|||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(workingDir + "/affiliation");
|
||||
|
||||
people = spark
|
||||
.read()
|
||||
.textFile(workingDir + "/people")
|
||||
.map(
|
||||
(MapFunction<String, Person>) value -> OBJECT_MAPPER
|
||||
.readValue(value, Person.class),
|
||||
Encoders.bean(Person.class));
|
||||
|
||||
people.show(false);
|
||||
people
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(p.getClass(), p))
|
||||
.union(
|
||||
getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
|
||||
.union(
|
||||
getRelations(spark, workingDir + "/coauthorship")
|
||||
.toJavaRDD()
|
||||
.map(r -> new AtomicAction(r.getClass(), r)))
|
||||
.union(
|
||||
getRelations(spark, workingDir + "/affiliation")
|
||||
.toJavaRDD()
|
||||
.map(r -> new AtomicAction(r.getClass(), r)))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
}
|
||||
|
||||
private static Dataset<Relation> getRelations(SparkSession spark, String path) {
|
||||
|
@ -297,7 +438,7 @@ public class ExtractPerson implements Serializable {
|
|||
}
|
||||
|
||||
private static Relation getAffiliationRelation(Employment row) {
|
||||
String source = PERSON_PREFIX + IdentifierFactory.md5(row.getOrcid());
|
||||
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(row.getOrcid());
|
||||
String target = ROR_PREFIX
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue()));
|
||||
List<KeyValue> properties = new ArrayList<>();
|
||||
|
@ -307,23 +448,17 @@ public class ExtractPerson implements Serializable {
|
|||
source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE,
|
||||
ModelConstants.ORG_PERSON_PARTICIPATES,
|
||||
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, false, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.91"),
|
||||
ORCIDDATAINFO,
|
||||
null);
|
||||
relation.setValidated(true);
|
||||
|
||||
if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
|
||||
if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtils.isNotBlank(row.getStartDate())) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey("startDate");
|
||||
kv.setValue(row.getStartDate());
|
||||
properties.add(kv);
|
||||
}
|
||||
if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) {
|
||||
if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtils.isNotBlank(row.getEndDate())) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey("endDate");
|
||||
kv.setValue(row.getEndDate());
|
||||
|
@ -336,45 +471,6 @@ public class ExtractPerson implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
private static Collection<? extends Relation> getCoAuthorshipRelations(String orcid1, String orcid2) {
|
||||
String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid1);
|
||||
String target = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid2);
|
||||
|
||||
return Arrays
|
||||
.asList(
|
||||
OafMapperUtils
|
||||
.getRelation(
|
||||
source, target, ModelConstants.PERSON_PERSON_RELTYPE,
|
||||
ModelConstants.PERSON_PERSON_SUBRELTYPE,
|
||||
ModelConstants.PERSON_PERSON_HASCOAUTHORED,
|
||||
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, false, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.91"),
|
||||
null),
|
||||
OafMapperUtils
|
||||
.getRelation(
|
||||
target, source, ModelConstants.PERSON_PERSON_RELTYPE,
|
||||
ModelConstants.PERSON_PERSON_SUBRELTYPE,
|
||||
ModelConstants.PERSON_PERSON_HASCOAUTHORED,
|
||||
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, false, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.91"),
|
||||
null));
|
||||
|
||||
}
|
||||
|
||||
private static @NotNull Iterator<Relation> getAuthorshipRelationIterator(Work w) {
|
||||
|
||||
if (Optional.ofNullable(w.getPids()).isPresent())
|
||||
|
@ -417,21 +513,15 @@ public class ExtractPerson implements Serializable {
|
|||
default:
|
||||
return null;
|
||||
}
|
||||
|
||||
return OafMapperUtils
|
||||
Relation relation = OafMapperUtils
|
||||
.getRelation(
|
||||
source, target, ModelConstants.RESULT_PERSON_RELTYPE,
|
||||
ModelConstants.RESULT_PERSON_SUBRELTYPE,
|
||||
ModelConstants.RESULT_PERSON_HASAUTHORED,
|
||||
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, false, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.91"),
|
||||
ORCIDDATAINFO,
|
||||
null);
|
||||
relation.setValidated(true);
|
||||
return relation;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,203 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.raid;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID;
|
||||
import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME;
|
||||
import static eu.dnetlib.dhp.common.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateRAiDActionSetJob {
|
||||
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final List<KeyValue> RAID_COLLECTED_FROM = listKeyValues(
|
||||
OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||
|
||||
private static final Qualifier RAID_QUALIFIER = qualifier(
|
||||
"0049", "Research Activity Identifier", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE);
|
||||
|
||||
private static final Qualifier RAID_INFERENCE_QUALIFIER = qualifier(
|
||||
"raid:openaireinference", "Inferred by OpenAIRE", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS);
|
||||
|
||||
private static final DataInfo RAID_DATA_INFO = dataInfo(
|
||||
false, OPENAIRE_DATASOURCE_NAME, true, false, RAID_INFERENCE_QUALIFIER, "0.92");
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
final String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
processRAiDEntities(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void removeOutputDir(final SparkSession spark, final String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
static void processRAiDEntities(final SparkSession spark,
|
||||
final String inputPath,
|
||||
final String outputPath) {
|
||||
readInputPath(spark, inputPath)
|
||||
.map(GenerateRAiDActionSetJob::prepareRAiD)
|
||||
.flatMap(List::iterator)
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
|
||||
}
|
||||
|
||||
protected static List<AtomicAction<? extends Oaf>> prepareRAiD(final RAiDEntity r) {
|
||||
|
||||
final Date now = new Date();
|
||||
final OtherResearchProduct orp = new OtherResearchProduct();
|
||||
final List<AtomicAction<? extends Oaf>> res = new ArrayList<>();
|
||||
String raidId = calculateOpenaireId(r.getRaid());
|
||||
|
||||
orp.setId(raidId);
|
||||
orp.setCollectedfrom(RAID_COLLECTED_FROM);
|
||||
orp.setDataInfo(RAID_DATA_INFO);
|
||||
orp
|
||||
.setTitle(
|
||||
Collections
|
||||
.singletonList(
|
||||
structuredProperty(
|
||||
r.getTitle(),
|
||||
qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE),
|
||||
RAID_DATA_INFO)));
|
||||
orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary()));
|
||||
|
||||
Instance instance = new Instance();
|
||||
instance.setInstancetype(RAID_QUALIFIER);
|
||||
orp.setInstance(Collections.singletonList(instance));
|
||||
orp
|
||||
.setSubject(
|
||||
r
|
||||
.getSubjects()
|
||||
.stream()
|
||||
.map(
|
||||
s -> subject(
|
||||
s,
|
||||
qualifier(
|
||||
DNET_SUBJECT_KEYWORD, DNET_SUBJECT_KEYWORD, DNET_SUBJECT_TYPOLOGIES,
|
||||
DNET_SUBJECT_TYPOLOGIES),
|
||||
RAID_DATA_INFO))
|
||||
.collect(Collectors.toList()));
|
||||
orp
|
||||
.setRelevantdate(
|
||||
Arrays
|
||||
.asList(
|
||||
structuredProperty(
|
||||
r.getEndDate(), qualifier(END_DATE, END_DATE, DNET_DATACITE_DATE, DNET_DATACITE_DATE),
|
||||
RAID_DATA_INFO),
|
||||
structuredProperty(
|
||||
r.getStartDate(),
|
||||
qualifier(START_DATE, START_DATE, DNET_DATACITE_DATE, DNET_DATACITE_DATE),
|
||||
RAID_DATA_INFO)));
|
||||
orp.setLastupdatetimestamp(now.getTime());
|
||||
orp.setDateofacceptance(field(r.getStartDate(), RAID_DATA_INFO));
|
||||
|
||||
res.add(new AtomicAction<>(OtherResearchProduct.class, orp));
|
||||
|
||||
for (String resultId : r.getIds()) {
|
||||
Relation rel1 = OafMapperUtils
|
||||
.getRelation(
|
||||
raidId,
|
||||
resultId,
|
||||
ModelConstants.RESULT_RESULT,
|
||||
PART,
|
||||
HAS_PART,
|
||||
orp);
|
||||
Relation rel2 = OafMapperUtils
|
||||
.getRelation(
|
||||
resultId,
|
||||
raidId,
|
||||
ModelConstants.RESULT_RESULT,
|
||||
PART,
|
||||
IS_PART_OF,
|
||||
orp);
|
||||
res.add(new AtomicAction<>(Relation.class, rel1));
|
||||
res.add(new AtomicAction<>(Relation.class, rel2));
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
public static String calculateOpenaireId(final String raid) {
|
||||
return String.format("50|%s::%s", RAID_NS_PREFIX, DHPUtils.md5(raid));
|
||||
}
|
||||
|
||||
public static List<Author> createAuthors(final List<String> author) {
|
||||
return author.stream().map(s -> {
|
||||
Author a = new Author();
|
||||
a.setFullname(s);
|
||||
return a;
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static JavaRDD<RAiDEntity> readInputPath(
|
||||
final SparkSession spark,
|
||||
final String path) {
|
||||
|
||||
return spark
|
||||
.read()
|
||||
.json(path)
|
||||
.as(Encoders.bean(RAiDEntity.class))
|
||||
.toJavaRDD();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.raid.model;
|
||||
|
||||
/**
 * Empty class: it declares no fields and no methods.
 *
 * NOTE(review): it has the same simple name as the Spark job in the parent package
 * (eu.dnetlib.dhp.actionmanager.raid); confirm whether this placeholder is needed at all.
 */
public class GenerateRAiDActionSetJob {
}
|
|
@ -0,0 +1,106 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.raid.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Serializable bean describing one RAiD record: identifier, authors, start and end dates, subjects, titles, linked
 * ids, main title and summary. All values are plain strings or lists of strings.
 */
public class RAiDEntity implements Serializable {

	// fields are intentionally left package-visible, as in the original declaration
	String raid;
	List<String> authors;
	String startDate;
	String endDate;
	List<String> subjects;
	List<String> titles;
	List<String> ids;
	String title;
	String summary;

	/** No-argument constructor; every field starts as null. */
	public RAiDEntity() {
	}

	/** Creates a fully populated record; the arguments are stored as given, without copying. */
	public RAiDEntity(final String raid, final List<String> authors, final String startDate, final String endDate,
		final List<String> subjects, final List<String> titles, final List<String> ids, final String title,
		final String summary) {
		this.raid = raid;
		this.authors = authors;
		this.startDate = startDate;
		this.endDate = endDate;
		this.subjects = subjects;
		this.titles = titles;
		this.ids = ids;
		this.title = title;
		this.summary = summary;
	}

	/** @return the RAiD value */
	public String getRaid() {
		return this.raid;
	}

	/** @param raid the RAiD value */
	public void setRaid(final String raid) {
		this.raid = raid;
	}

	/** @return the authors */
	public List<String> getAuthors() {
		return this.authors;
	}

	/** @param authors the authors */
	public void setAuthors(final List<String> authors) {
		this.authors = authors;
	}

	/** @return the start date */
	public String getStartDate() {
		return this.startDate;
	}

	/** @param startDate the start date */
	public void setStartDate(final String startDate) {
		this.startDate = startDate;
	}

	/** @return the end date */
	public String getEndDate() {
		return this.endDate;
	}

	/** @param endDate the end date */
	public void setEndDate(final String endDate) {
		this.endDate = endDate;
	}

	/** @return the subjects */
	public List<String> getSubjects() {
		return this.subjects;
	}

	/** @param subjects the subjects */
	public void setSubjects(final List<String> subjects) {
		this.subjects = subjects;
	}

	/** @return the titles */
	public List<String> getTitles() {
		return this.titles;
	}

	/** @param titles the titles */
	public void setTitles(final List<String> titles) {
		this.titles = titles;
	}

	/** @return the linked ids */
	public List<String> getIds() {
		return this.ids;
	}

	/** @param ids the linked ids */
	public void setIds(final List<String> ids) {
		this.ids = ids;
	}

	/** @return the main title */
	public String getTitle() {
		return this.title;
	}

	/** @param title the main title */
	public void setTitle(final String title) {
		this.title = title;
	}

	/** @return the summary */
	public String getSummary() {
		return this.summary;
	}

	/** @param summary the summary */
	public void setSummary(final String summary) {
		this.summary = summary;
	}
}
|
|
@ -44,13 +44,7 @@ import eu.dnetlib.dhp.common.Constants;
|
|||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
|
|
|
@ -20,6 +20,8 @@ import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
|
|||
import eu.dnetlib.dhp.aggregation.common.ReportingJob;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.base.BaseCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.csv.FileCsvCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.csv.HttpCsvCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.gtr2.Gtr2PublicationsCollectorPlugin;
|
||||
|
@ -28,6 +30,7 @@ import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
|
|||
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.osf.OsfPreprintsCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.zenodo.CollectZenodoDumpCollectorPlugin;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
|
@ -126,6 +129,12 @@ public class CollectorWorker extends ReportingJob {
|
|||
return new Gtr2PublicationsCollectorPlugin(this.clientParams);
|
||||
case osfPreprints:
|
||||
return new OsfPreprintsCollectorPlugin(this.clientParams);
|
||||
case zenodoDump:
|
||||
return new CollectZenodoDumpCollectorPlugin();
|
||||
case fileCSV:
|
||||
return new FileCsvCollectorPlugin(this.fileSystem);
|
||||
case httpCSV:
|
||||
return new HttpCsvCollectorPlugin(this.clientParams, this.fileSystem);
|
||||
case other:
|
||||
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
|
||||
.ofNullable(this.api.getParams().get("other_plugin_type"))
|
||||
|
|
|
@ -154,7 +154,6 @@ public class ORCIDExtractor extends Thread {
|
|||
extractedItem++;
|
||||
if (extractedItem % 100000 == 0) {
|
||||
log.info("Thread {}: Extracted {} items", id, extractedItem);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,10 +11,21 @@ public interface CollectorPlugin {
|
|||
|
||||
enum NAME {
|
||||
|
||||
oai, other, rest_json2xml, file, fileGzip, baseDump, gtr2Publications, osfPreprints;
|
||||
oai,
|
||||
other,
|
||||
rest_json2xml,
|
||||
file,
|
||||
fileGzip,
|
||||
baseDump,
|
||||
gtr2Publications,
|
||||
osfPreprints,
|
||||
zenodoDump,
|
||||
fileCSV,
|
||||
httpCSV;
|
||||
|
||||
public enum OTHER_NAME {
|
||||
mdstore_mongodb_dump, mdstore_mongodb
|
||||
mdstore_mongodb_dump,
|
||||
mdstore_mongodb
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,146 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.csv;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Iterator;
|
||||
import java.util.Optional;
|
||||
import java.util.Spliterator;
|
||||
import java.util.Spliterators;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.apache.commons.lang3.BooleanUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Element;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
public class FileCsvCollectorPlugin implements CollectorPlugin {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(FileCsvCollectorPlugin.class);
|
||||
|
||||
private final FileSystem fileSystem;
|
||||
|
||||
public FileCsvCollectorPlugin(final FileSystem fileSystem) {
|
||||
this.fileSystem = fileSystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
|
||||
|
||||
final Path filePath = Optional
|
||||
.ofNullable(api.getBaseUrl())
|
||||
.map(Path::new)
|
||||
.orElseThrow(() -> new CollectorException("missing baseUrl"));
|
||||
|
||||
final boolean withHeaders = BooleanUtils.toBoolean(api.getParams().get("header"));
|
||||
final String separator = api.getParams().get("separator");
|
||||
final int identifierNumber = NumberUtils.toInt(api.getParams().get("identifier"), 0);
|
||||
final String quote = api.getParams().get("quote");
|
||||
|
||||
final String[] headers;
|
||||
|
||||
try (InputStream is = this.fileSystem.open(filePath);
|
||||
BOMInputStream bomis = new BOMInputStream(is);
|
||||
InputStreamReader isr = new InputStreamReader(bomis);
|
||||
BufferedReader br = new BufferedReader(isr)) {
|
||||
|
||||
if (withHeaders) {
|
||||
final String[] tmpHeader = br.readLine().split(separator);
|
||||
if (StringUtils.isNotBlank(quote)) {
|
||||
int i = 0;
|
||||
headers = new String[tmpHeader.length];
|
||||
for (final String h : tmpHeader) {
|
||||
headers[i] = StringUtils.strip(h, quote);
|
||||
i++;
|
||||
}
|
||||
} else {
|
||||
headers = tmpHeader;
|
||||
}
|
||||
} else {
|
||||
headers = null;
|
||||
}
|
||||
|
||||
final Iterator<String> iterator = new Iterator<String>() {
|
||||
|
||||
private String next = calculateNext();
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return this.next != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
try {
|
||||
return new String(this.next);
|
||||
} finally {
|
||||
this.next = calculateNext();
|
||||
}
|
||||
}
|
||||
|
||||
private String calculateNext() {
|
||||
try {
|
||||
final Document document = DocumentHelper.createDocument();
|
||||
final Element root = document.addElement("csvRecord");
|
||||
|
||||
String newLine = br.readLine();
|
||||
|
||||
// FIX: FOR SOME FILES IT RETURN NULL ALSO IF THE FILE IS NOT READY DONE
|
||||
if (newLine == null) {
|
||||
newLine = br.readLine();
|
||||
}
|
||||
// END FIX
|
||||
|
||||
if (newLine != null) {
|
||||
final String[] currentRow = StringUtils.split(newLine, separator);
|
||||
|
||||
if (currentRow != null) {
|
||||
|
||||
for (int i = 0; i < currentRow.length; i++) {
|
||||
final String hAttribute = (headers != null) && (i < headers.length) ? headers[i] : "column" + i;
|
||||
|
||||
final Element row = root.addElement("column");
|
||||
if (i == identifierNumber) {
|
||||
row.addAttribute("isID", "true");
|
||||
}
|
||||
final String value = StringUtils.isBlank(quote) ? currentRow[i] : StringUtils.strip(currentRow[i], quote);
|
||||
|
||||
row.addAttribute("name", hAttribute).addText(value);
|
||||
}
|
||||
return document.asXML();
|
||||
}
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
log.error("Error calculating next csv element", e);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
|
||||
|
||||
return StreamSupport.stream(spliterator, false);
|
||||
|
||||
} catch (final Throwable e) {
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,169 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.csv;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
import java.util.Spliterator;
|
||||
import java.util.Spliterators;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.commons.csv.CSVFormat;
|
||||
import org.apache.commons.csv.CSVParser;
|
||||
import org.apache.commons.io.input.BOMInputStream;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Element;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.common.collect.Iterators;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.collection.plugin.utils.XmlCleaner;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.common.collection.HttpConnector2;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class HttpCsvCollectorPlugin implements CollectorPlugin {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(HttpCsvCollectorPlugin.class);
|
||||
|
||||
private final FileSystem fileSystem;
|
||||
|
||||
private final HttpConnector2 httpConnector;
|
||||
|
||||
public HttpCsvCollectorPlugin(final HttpClientParams clientParams, final FileSystem fileSystem) {
|
||||
this.httpConnector = new HttpConnector2(clientParams);
|
||||
this.fileSystem = fileSystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
|
||||
|
||||
final String baseUrl = api.getBaseUrl();
|
||||
|
||||
final String separator = api.getParams().get("separator");
|
||||
final String identifier = api.getParams().get("identifier");
|
||||
final String quote = api.getParams().get("quote");
|
||||
|
||||
long nLines = 0;
|
||||
|
||||
try {
|
||||
// FIX
|
||||
// This code should skip the lines with invalid quotes
|
||||
final Path tempPath = new Path("/tmp/" + DHPUtils.md5(baseUrl) + ".csv.tmp");
|
||||
|
||||
try (InputStream is = this.httpConnector.getInputSourceAsStream(baseUrl);
|
||||
BOMInputStream bomIs = new BOMInputStream(is);
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs));
|
||||
FSDataOutputStream fsdos = this.fileSystem.create(tempPath, true);
|
||||
BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fsdos, StandardCharsets.UTF_8))) {
|
||||
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
if (StringUtils.isBlank(quote) || (quote.charAt(0) != '"') || verifyQuotes(line, separator.charAt(0))) {
|
||||
bw.write(line);
|
||||
bw.write("\n");
|
||||
nLines++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// END FIX
|
||||
|
||||
final CSVFormat format = CSVFormat.EXCEL
|
||||
.withHeader()
|
||||
.withDelimiter("\\t".equals(separator) || StringUtils.isBlank(separator) ? '\t' : separator.charAt(0))
|
||||
.withQuote(StringUtils.isBlank(quote) ? null : quote.charAt(0))
|
||||
.withTrim();
|
||||
|
||||
try (InputStream is = this.fileSystem.open(tempPath);
|
||||
InputStreamReader isr = new InputStreamReader(is);
|
||||
BufferedReader br = new BufferedReader(isr);
|
||||
final CSVParser parser = new CSVParser(br, format)) {
|
||||
|
||||
final Set<String> headers = parser.getHeaderMap().keySet();
|
||||
|
||||
final long nRecords = nLines - 1;
|
||||
|
||||
final Iterator<String> iterator = Iterators.transform(parser.iterator(), input -> {
|
||||
try {
|
||||
final Document document = DocumentHelper.createDocument();
|
||||
final Element root = document.addElement("csvRecord");
|
||||
for (final String key : headers) {
|
||||
final Element row = root.addElement("column");
|
||||
final String value = XmlCleaner.cleanAllEntities(input.get(key));
|
||||
if (value != null) {
|
||||
row.addAttribute("name", key).addText(value);
|
||||
}
|
||||
if (key.equals(identifier)) {
|
||||
row.addAttribute("isID", "true");
|
||||
}
|
||||
}
|
||||
|
||||
return document.asXML();
|
||||
} finally {
|
||||
if (parser.getRecordNumber() == nRecords) {
|
||||
try {
|
||||
this.fileSystem.delete(tempPath, false);
|
||||
} catch (final IOException e) {
|
||||
log.warn("Error deleting temp file: " + tempPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
|
||||
|
||||
return StreamSupport.stream(spliterator, false);
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
log.error("Error parsing csv", e);
|
||||
throw new CollectorException("Error parsing csv", e);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean verifyQuotes(final String line, final char separator) {
|
||||
final char[] cs = line.trim().toCharArray();
|
||||
boolean inField = false;
|
||||
boolean skipNext = false;
|
||||
for (int i = 0; i < cs.length; i++) {
|
||||
if (skipNext) {
|
||||
skipNext = false;
|
||||
} else if (inField) {
|
||||
if ((cs[i] == '\"') && ((i == (cs.length - 1)) || (cs[i + 1] == separator))) {
|
||||
inField = false;
|
||||
} else if ((cs[i] == '\"') && (i < (cs.length - 1))) {
|
||||
if (cs[i + 1] != '\"') {
|
||||
log.warn("Skipped invalid line: " + line);
|
||||
return false;
|
||||
}
|
||||
skipNext = true;
|
||||
}
|
||||
} else if ((cs[i] == '\"') && ((i == 0) || (cs[i - 1] == separator))) {
|
||||
inField = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (inField) {
|
||||
log.warn("Skipped invalid line: " + line);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -1,6 +1,8 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.gtr2;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
|
@ -8,17 +10,19 @@ import java.util.LinkedList;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.http.Header;
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Element;
|
||||
import org.joda.time.DateTime;
|
||||
import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -28,12 +32,10 @@ import eu.dnetlib.dhp.common.collection.HttpConnector2;
|
|||
|
||||
public class Gtr2PublicationsIterator implements Iterator<String> {
|
||||
|
||||
public static final int PAGE_SIZE = 20;
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(Gtr2PublicationsIterator.class);
|
||||
|
||||
private final HttpConnector2 connector;
|
||||
private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
|
||||
private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd");
|
||||
|
||||
private static final int MAX_ATTEMPTS = 10;
|
||||
|
||||
|
@ -41,8 +43,7 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
|
|||
private int currPage;
|
||||
private int endPage;
|
||||
private boolean incremental = false;
|
||||
private DateTime fromDate;
|
||||
|
||||
private LocalDate fromDate;
|
||||
private final Map<String, String> cache = new HashMap<>();
|
||||
|
||||
private final Queue<String> queue = new LinkedList<>();
|
||||
|
@ -88,7 +89,7 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
|
|||
|
||||
private void prepareNextElement() {
|
||||
while ((this.currPage <= this.endPage) && this.queue.isEmpty()) {
|
||||
log.debug("FETCHING PAGE + " + this.currPage + "/" + this.endPage);
|
||||
log.info("FETCHING PAGE + " + this.currPage + "/" + this.endPage);
|
||||
this.queue.addAll(fetchPage(this.currPage++));
|
||||
}
|
||||
this.nextElement = this.queue.poll();
|
||||
|
@ -97,18 +98,17 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
|
|||
private List<String> fetchPage(final int pageNumber) {
|
||||
|
||||
final List<String> res = new ArrayList<>();
|
||||
try {
|
||||
final Document doc = loadURL(cleanURL(this.baseUrl + "/outcomes/publications?p=" + pageNumber), 0);
|
||||
|
||||
if (this.endPage == Integer.MAX_VALUE) {
|
||||
this.endPage = NumberUtils.toInt(doc.valueOf("/*/@*[local-name() = 'totalPages']"));
|
||||
}
|
||||
try {
|
||||
final Document doc = loadURL(this.baseUrl + "/publication?page=" + pageNumber, 0);
|
||||
|
||||
for (final Object po : doc.selectNodes("//*[local-name() = 'publication']")) {
|
||||
|
||||
final Element mainEntity = (Element) ((Element) po).detach();
|
||||
|
||||
if (filterIncremental(mainEntity)) {
|
||||
res.add(expandMainEntity(mainEntity));
|
||||
final String publicationOverview = mainEntity.attributeValue("url");
|
||||
res.add(loadURL(publicationOverview, -1).asXML());
|
||||
} else {
|
||||
log.debug("Skipped entity");
|
||||
}
|
||||
|
@ -122,34 +122,6 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
|
|||
return res;
|
||||
}
|
||||
|
||||
private void addLinkedEntities(final Element master, final String relType, final Element newRoot,
|
||||
final Function<Document, Element> mapper) {
|
||||
|
||||
for (final Object o : master.selectNodes(".//*[local-name()='link']")) {
|
||||
final String rel = ((Element) o).valueOf("@*[local-name()='rel']");
|
||||
final String href = ((Element) o).valueOf("@*[local-name()='href']");
|
||||
|
||||
if (relType.equals(rel) && StringUtils.isNotBlank(href)) {
|
||||
final String cacheKey = relType + "#" + href;
|
||||
if (this.cache.containsKey(cacheKey)) {
|
||||
try {
|
||||
log.debug(" * from cache (" + relType + "): " + href);
|
||||
newRoot.add(DocumentHelper.parseText(this.cache.get(cacheKey)).getRootElement());
|
||||
} catch (final DocumentException e) {
|
||||
log.error("Error retrieving cache element: " + cacheKey, e);
|
||||
throw new RuntimeException("Error retrieving cache element: " + cacheKey, e);
|
||||
}
|
||||
} else {
|
||||
final Document doc = loadURL(cleanURL(href), 0);
|
||||
final Element elem = mapper.apply(doc);
|
||||
newRoot.add(elem);
|
||||
this.cache.put(cacheKey, elem.asXML());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean filterIncremental(final Element e) {
|
||||
if (!this.incremental || isAfter(e.valueOf("@*[local-name() = 'created']"), this.fromDate)
|
||||
|| isAfter(e.valueOf("@*[local-name() = 'updated']"), this.fromDate)) {
|
||||
|
@ -158,58 +130,52 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
|
|||
return false;
|
||||
}
|
||||
|
||||
private String expandMainEntity(final Element mainEntity) {
|
||||
final Element newRoot = DocumentHelper.createElement("doc");
|
||||
newRoot.add(mainEntity);
|
||||
addLinkedEntities(mainEntity, "PROJECT", newRoot, this::asProjectElement);
|
||||
return DocumentHelper.createDocument(newRoot).asXML();
|
||||
}
|
||||
|
||||
private Element asProjectElement(final Document doc) {
|
||||
final Element newOrg = DocumentHelper.createElement("project");
|
||||
newOrg.addElement("id").setText(doc.valueOf("/*/@*[local-name()='id']"));
|
||||
newOrg
|
||||
.addElement("code")
|
||||
.setText(doc.valueOf("//*[local-name()='identifier' and @*[local-name()='type'] = 'RCUK']"));
|
||||
newOrg.addElement("title").setText(doc.valueOf("//*[local-name()='title']"));
|
||||
return newOrg;
|
||||
}
|
||||
|
||||
private static String cleanURL(final String url) {
|
||||
String cleaned = url;
|
||||
if (cleaned.contains("gtr.gtr")) {
|
||||
cleaned = cleaned.replace("gtr.gtr", "gtr");
|
||||
}
|
||||
if (cleaned.startsWith("http://")) {
|
||||
cleaned = cleaned.replaceFirst("http://", "https://");
|
||||
}
|
||||
return cleaned;
|
||||
}
|
||||
|
||||
private Document loadURL(final String cleanUrl, final int attempt) {
|
||||
try {
|
||||
log.debug(" * Downloading Url: " + cleanUrl);
|
||||
final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes("UTF-8");
|
||||
return DocumentHelper.parseText(new String(bytes));
|
||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
|
||||
final HttpGet req = new HttpGet(cleanUrl);
|
||||
req.setHeader(HttpHeaders.ACCEPT, "application/xml");
|
||||
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||
if (endPage == Integer.MAX_VALUE)
|
||||
for (final Header header : response.getAllHeaders()) {
|
||||
log.debug("HEADER: " + header.getName() + " = " + header.getValue());
|
||||
if ("Link-Pages".equals(header.getName())) {
|
||||
if (Integer.parseInt(header.getValue()) < endPage)
|
||||
endPage = Integer.parseInt(header.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
final String content = IOUtils.toString(response.getEntity().getContent());
|
||||
return DocumentHelper.parseText(content);
|
||||
|
||||
}
|
||||
|
||||
} catch (final Throwable e) {
|
||||
log.error("Error dowloading url: " + cleanUrl + ", attempt = " + attempt, e);
|
||||
|
||||
if (attempt == -1)
|
||||
try {
|
||||
return DocumentHelper.parseText("<empty></empty>");
|
||||
} catch (Throwable t) {
|
||||
throw new RuntimeException();
|
||||
}
|
||||
log.error("Error dowloading url: {}, attempt = {}", cleanUrl, attempt, e);
|
||||
if (attempt >= MAX_ATTEMPTS) {
|
||||
throw new RuntimeException("Error dowloading url: " + cleanUrl, e);
|
||||
throw new RuntimeException("Error downloading url: " + cleanUrl, e);
|
||||
}
|
||||
try {
|
||||
Thread.sleep(60000); // I wait for a minute
|
||||
} catch (final InterruptedException e1) {
|
||||
throw new RuntimeException("Error dowloading url: " + cleanUrl, e);
|
||||
throw new RuntimeException("Error downloading url: " + cleanUrl, e);
|
||||
}
|
||||
return loadURL(cleanUrl, attempt + 1);
|
||||
}
|
||||
}
|
||||
|
||||
private DateTime parseDate(final String s) {
|
||||
return DateTime.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter);
|
||||
private LocalDate parseDate(final String s) {
|
||||
return LocalDate.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter);
|
||||
}
|
||||
|
||||
private boolean isAfter(final String d, final DateTime fromDate) {
|
||||
private boolean isAfter(final String d, final LocalDate fromDate) {
|
||||
return StringUtils.isNotBlank(d) && parseDate(d).isAfter(fromDate);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,17 +31,19 @@ public class OsfPreprintsCollectorPlugin implements CollectorPlugin {
|
|||
final String baseUrl = api.getBaseUrl();
|
||||
|
||||
final int pageSize = Optional
|
||||
.ofNullable(api.getParams().get("pageSize"))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(s -> NumberUtils.toInt(s, PAGE_SIZE_VALUE_DEFAULT))
|
||||
.orElse(PAGE_SIZE_VALUE_DEFAULT);
|
||||
.ofNullable(api.getParams().get("pageSize"))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(s -> NumberUtils.toInt(s, PAGE_SIZE_VALUE_DEFAULT))
|
||||
.orElse(PAGE_SIZE_VALUE_DEFAULT);
|
||||
|
||||
if (StringUtils.isBlank(baseUrl)) { throw new CollectorException("Param 'baseUrl' is null or empty"); }
|
||||
if (StringUtils.isBlank(baseUrl)) {
|
||||
throw new CollectorException("Param 'baseUrl' is null or empty");
|
||||
}
|
||||
|
||||
final OsfPreprintsIterator it = new OsfPreprintsIterator(baseUrl, pageSize, getClientParams());
|
||||
|
||||
return StreamSupport
|
||||
.stream(Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
|
||||
.stream(Spliterators.spliteratorUnknownSize(it, Spliterator.ORDERED), false);
|
||||
}
|
||||
|
||||
public HttpClientParams getClientParams() {
|
||||
|
|
|
@ -34,9 +34,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
|
||||
|
||||
public OsfPreprintsIterator(
|
||||
final String baseUrl,
|
||||
final int pageSize,
|
||||
final HttpClientParams clientParams) {
|
||||
final String baseUrl,
|
||||
final int pageSize,
|
||||
final HttpClientParams clientParams) {
|
||||
|
||||
this.clientParams = clientParams;
|
||||
this.baseUrl = baseUrl;
|
||||
|
@ -54,7 +54,8 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
@Override
|
||||
public boolean hasNext() {
|
||||
synchronized (this.recordQueue) {
|
||||
while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl) && this.currentUrl.startsWith("http")) {
|
||||
while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl)
|
||||
&& this.currentUrl.startsWith("http")) {
|
||||
try {
|
||||
this.currentUrl = downloadPage(this.currentUrl);
|
||||
} catch (final CollectorException e) {
|
||||
|
@ -63,7 +64,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
}
|
||||
}
|
||||
|
||||
if (!this.recordQueue.isEmpty()) { return true; }
|
||||
if (!this.recordQueue.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -112,7 +115,9 @@ public class OsfPreprintsIterator implements Iterator<String> {
|
|||
}
|
||||
|
||||
private Document downloadUrl(final String url, final int attempt) throws CollectorException {
|
||||
if (attempt > MAX_ATTEMPTS) { throw new CollectorException("Max Number of attempts reached, url:" + url); }
|
||||
if (attempt > MAX_ATTEMPTS) {
|
||||
throw new CollectorException("Max Number of attempts reached, url:" + url);
|
||||
}
|
||||
|
||||
if (attempt > 0) {
|
||||
final int delay = (attempt * 5000);
|
||||
|
|
|
@ -6,7 +6,7 @@ import java.util.Queue;
|
|||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.http.Header;
|
||||
|
@ -27,25 +27,25 @@ public class ResearchFiIterator implements Iterator<String> {
|
|||
|
||||
private final String baseUrl;
|
||||
private final String authToken;
|
||||
private int currPage;
|
||||
private int nPages;
|
||||
private String nextUrl;
|
||||
private int nCalls = 0;
|
||||
|
||||
private final Queue<String> queue = new PriorityBlockingQueue<>();
|
||||
|
||||
public ResearchFiIterator(final String baseUrl, final String authToken) {
|
||||
this.baseUrl = baseUrl;
|
||||
this.authToken = authToken;
|
||||
this.currPage = 0;
|
||||
this.nPages = 0;
|
||||
this.nextUrl = null;
|
||||
}
|
||||
|
||||
private void verifyStarted() {
|
||||
if (this.currPage == 0) {
|
||||
try {
|
||||
nextCall();
|
||||
} catch (final CollectorException e) {
|
||||
throw new IllegalStateException(e);
|
||||
|
||||
try {
|
||||
if (this.nCalls == 0) {
|
||||
this.nextUrl = invokeUrl(this.baseUrl);
|
||||
}
|
||||
} catch (final CollectorException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -62,9 +62,9 @@ public class ResearchFiIterator implements Iterator<String> {
|
|||
synchronized (this.queue) {
|
||||
verifyStarted();
|
||||
final String res = this.queue.poll();
|
||||
while (this.queue.isEmpty() && (this.currPage < this.nPages)) {
|
||||
while (this.queue.isEmpty() && StringUtils.isNotBlank(this.nextUrl)) {
|
||||
try {
|
||||
nextCall();
|
||||
this.nextUrl = invokeUrl(this.nextUrl);
|
||||
} catch (final CollectorException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
|
@ -73,18 +73,11 @@ public class ResearchFiIterator implements Iterator<String> {
|
|||
}
|
||||
}
|
||||
|
||||
private void nextCall() throws CollectorException {
|
||||
private String invokeUrl(final String url) throws CollectorException {
|
||||
|
||||
this.currPage += 1;
|
||||
this.nCalls += 1;
|
||||
String next = null;
|
||||
|
||||
final String url;
|
||||
if (!this.baseUrl.contains("?")) {
|
||||
url = String.format("%s?PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
|
||||
} else if (!this.baseUrl.contains("PageSize=")) {
|
||||
url = String.format("%s&PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
|
||||
} else {
|
||||
url = String.format("%s&PageNumber=%d", this.baseUrl, this.currPage);
|
||||
}
|
||||
log.info("Calling url: " + url);
|
||||
|
||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
|
@ -94,11 +87,15 @@ public class ResearchFiIterator implements Iterator<String> {
|
|||
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||
for (final Header header : response.getAllHeaders()) {
|
||||
log.debug("HEADER: " + header.getName() + " = " + header.getValue());
|
||||
if ("x-page-count".equals(header.getName())) {
|
||||
final int totalPages = NumberUtils.toInt(header.getValue());
|
||||
if (this.nPages != totalPages) {
|
||||
this.nPages = NumberUtils.toInt(header.getValue());
|
||||
log.info("Total pages: " + totalPages);
|
||||
if ("link".equals(header.getName())) {
|
||||
final String s = StringUtils.substringBetween(header.getValue(), "<", ">");
|
||||
final String token = StringUtils
|
||||
.substringBefore(StringUtils.substringAfter(s, "NextPageToken="), "&");
|
||||
|
||||
if (this.baseUrl.contains("?")) {
|
||||
next = this.baseUrl + "&NextPageToken=" + token;
|
||||
} else {
|
||||
next = this.baseUrl + "?NextPageToken=" + token;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -108,6 +105,9 @@ public class ResearchFiIterator implements Iterator<String> {
|
|||
|
||||
jsonArray.forEach(obj -> this.queue.add(JsonUtils.convertToXML(obj.toString())));
|
||||
}
|
||||
|
||||
return next;
|
||||
|
||||
} catch (final Throwable e) {
|
||||
log.warn("Error calling url: " + url, e);
|
||||
throw new CollectorException("Error calling url: " + url, e);
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.zenodo;
|
||||
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.apache.http.client.config.RequestConfig;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClientBuilder;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
public class CollectZenodoDumpCollectorPlugin implements CollectorPlugin {
|
||||
|
||||
final private Logger log = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private void downloadItem(final String name, final String itemURL, final String basePath,
|
||||
final FileSystem fileSystem) {
|
||||
try {
|
||||
final Path hdfsWritePath = new Path(String.format("%s/%s", basePath, name));
|
||||
final FSDataOutputStream fsDataOutputStream = fileSystem.create(hdfsWritePath, true);
|
||||
final HttpGet request = new HttpGet(itemURL);
|
||||
final int timeout = 60; // seconds
|
||||
final RequestConfig config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build();
|
||||
log.info("Downloading url {} into {}", itemURL, hdfsWritePath.getName());
|
||||
try (CloseableHttpClient client = HttpClientBuilder.create().setDefaultRequestConfig(config).build();
|
||||
CloseableHttpResponse response = client.execute(request)) {
|
||||
int responseCode = response.getStatusLine().getStatusCode();
|
||||
log.info("Response code is {}", responseCode);
|
||||
if (responseCode >= 200 && responseCode < 400) {
|
||||
IOUtils.copy(response.getEntity().getContent(), fsDataOutputStream);
|
||||
}
|
||||
} catch (Throwable eu) {
|
||||
throw new RuntimeException(eu);
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
|
||||
try {
|
||||
final String zenodoURL = api.getBaseUrl();
|
||||
final String hdfsURI = api.getParams().get("hdfsURI");
|
||||
final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsURI));
|
||||
downloadItem("zenodoDump.tar.gz", zenodoURL, "/tmp", fileSystem);
|
||||
CompressionCodecFactory factory = new CompressionCodecFactory(fileSystem.getConf());
|
||||
|
||||
Path sourcePath = new Path("/tmp/zenodoDump.tar.gz");
|
||||
CompressionCodec codec = factory.getCodec(sourcePath);
|
||||
InputStream gzipInputStream = null;
|
||||
try {
|
||||
gzipInputStream = codec.createInputStream(fileSystem.open(sourcePath));
|
||||
return iterateTar(gzipInputStream);
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new CollectorException(e);
|
||||
} finally {
|
||||
log.info("Closing gzip stream");
|
||||
org.apache.hadoop.io.IOUtils.closeStream(gzipInputStream);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new CollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private Stream<String> iterateTar(InputStream gzipInputStream) throws Exception {
|
||||
|
||||
Iterable<String> iterable = () -> new ZenodoTarIterator(gzipInputStream);
|
||||
return StreamSupport.stream(iterable.spliterator(), false);
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.zenodo;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
public class ZenodoTarIterator implements Iterator<String>, Closeable {
|
||||
|
||||
private final InputStream gzipInputStream;
|
||||
private final StringBuilder currentItem = new StringBuilder();
|
||||
private TarArchiveInputStream tais;
|
||||
private boolean hasNext;
|
||||
|
||||
public ZenodoTarIterator(InputStream gzipInputStream) {
|
||||
this.gzipInputStream = gzipInputStream;
|
||||
tais = new TarArchiveInputStream(gzipInputStream);
|
||||
hasNext = getNextItem();
|
||||
}
|
||||
|
||||
private boolean getNextItem() {
|
||||
try {
|
||||
TarArchiveEntry entry;
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
if (entry.isFile()) {
|
||||
currentItem.setLength(0);
|
||||
currentItem.append(IOUtils.toString(new InputStreamReader(tais)));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return hasNext;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
final String data = currentItem.toString();
|
||||
hasNext = getNextItem();
|
||||
return data;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
gzipInputStream.close();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
/**
|
||||
* The type Pubmed Affiliation.
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
public class PMAffiliation {
|
||||
|
||||
private String name;
|
||||
|
||||
private PMIdentifier identifier;
|
||||
|
||||
public PMAffiliation() {
|
||||
|
||||
}
|
||||
|
||||
public PMAffiliation(String name, PMIdentifier identifier) {
|
||||
this.name = name;
|
||||
this.identifier = identifier;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public PMIdentifier getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public void setIdentifier(PMIdentifier identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
}
|
|
@ -8,259 +8,115 @@ import java.util.List;
|
|||
/**
|
||||
* This class represent an instance of Pubmed Article extracted from the native XML
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
|
||||
public class PMArticle implements Serializable {
|
||||
|
||||
/**
|
||||
* the Pubmed Identifier
|
||||
*/
|
||||
private String pmid;
|
||||
|
||||
private String pmcId;
|
||||
|
||||
/**
|
||||
* the DOI
|
||||
*/
|
||||
private String doi;
|
||||
/**
|
||||
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
|
||||
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
|
||||
*/
|
||||
private String date;
|
||||
/**
|
||||
* This is an 'envelop' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself.
|
||||
*/
|
||||
private PMJournal journal;
|
||||
/**
|
||||
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the <MedlineTA> element.
|
||||
*/
|
||||
private String title;
|
||||
/**
|
||||
* English-language abstracts are taken directly from the published article.
|
||||
* If the article does not have a published abstract, the National Library of Medicine does not create one,
|
||||
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
|
||||
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
|
||||
*/
|
||||
private String description;
|
||||
/**
|
||||
* the language in which an article was published is recorded in <Language>.
|
||||
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
|
||||
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
|
||||
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
|
||||
*/
|
||||
private String language;
|
||||
|
||||
/**
|
||||
* NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations. *
|
||||
*/
|
||||
private final List<PMSubject> subjects = new ArrayList<>();
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*/
|
||||
private final List<PMSubject> publicationTypes = new ArrayList<>();
|
||||
/**
|
||||
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
|
||||
*/
|
||||
private List<PMSubject> subjects;
|
||||
private List<PMSubject> publicationTypes = new ArrayList<>();
|
||||
private List<PMAuthor> authors = new ArrayList<>();
|
||||
private List<PMGrant> grants = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
|
||||
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
|
||||
*/
|
||||
private final List<PMGrant> grants = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* get the DOI
|
||||
* @return a DOI
|
||||
*/
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the DOI
|
||||
* @param doi a DOI
|
||||
*/
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the Pubmed Identifier
|
||||
* @return the PMID
|
||||
*/
|
||||
public String getPmid() {
|
||||
return pmid;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the Pubmed Identifier
|
||||
* @param pmid the Pubmed Identifier
|
||||
*/
|
||||
public void setPmid(String pmid) {
|
||||
this.pmid = pmid;
|
||||
}
|
||||
|
||||
/**
|
||||
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
|
||||
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
|
||||
*
|
||||
* @return the Pubmed Date
|
||||
*/
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the pubmed Date
|
||||
* @param date
|
||||
*/
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
/**
|
||||
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
|
||||
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
|
||||
* The NLM journal title abbreviation is exported in the <MedlineTA> element.
|
||||
*
|
||||
* @return the pubmed Journal Extracted
|
||||
*/
|
||||
public PMJournal getJournal() {
|
||||
return journal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the mapped pubmed Journal
|
||||
* @param journal
|
||||
*/
|
||||
public void setJournal(PMJournal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
/**
|
||||
* <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English;
|
||||
* those titles originally published in a non-English language and translated for <ArticleTitle> are enclosed in square brackets.
|
||||
* All titles end with a period unless another punctuation mark such as a question mark or bracket is present.
|
||||
* Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl).
|
||||
* Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000.
|
||||
*
|
||||
* @return the extracted pubmed Title
|
||||
*/
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the pubmed title
|
||||
* @param title
|
||||
*/
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
/**
|
||||
* English-language abstracts are taken directly from the published article.
|
||||
* If the article does not have a published abstract, the National Library of Medicine does not create one,
|
||||
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
|
||||
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
|
||||
*
|
||||
* @return the Mapped Pubmed Article Abstracts
|
||||
*/
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the Mapped Pubmed Article Abstracts
|
||||
* @param description
|
||||
*/
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
|
||||
*
|
||||
* @return get the Mapped Authors lists
|
||||
*/
|
||||
public List<PMAuthor> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the Mapped Authors lists
|
||||
* @param authors
|
||||
*/
|
||||
public void setAuthors(List<PMAuthor> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*
|
||||
* @return the mapped Subjects
|
||||
*/
|
||||
public List<PMSubject> getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* the language in which an article was published is recorded in <Language>.
|
||||
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
|
||||
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
|
||||
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
|
||||
*
|
||||
* @return The mapped Language
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Set The mapped Language
|
||||
*
|
||||
* @param language the mapped Language
|
||||
*/
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*
|
||||
* @return the mapped Publication Type
|
||||
*/
|
||||
public List<PMSubject> getPublicationTypes() {
|
||||
return publicationTypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
|
||||
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
|
||||
* @return the mapped grants
|
||||
*/
|
||||
|
||||
public List<PMGrant> getGrants() {
|
||||
return grants;
|
||||
}
|
||||
|
||||
public String getPmcId() {
|
||||
return pmcId;
|
||||
}
|
||||
|
||||
public PMArticle setPmcId(String pmcId) {
|
||||
public void setPmcId(String pmcId) {
|
||||
this.pmcId = pmcId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
public PMJournal getJournal() {
|
||||
return journal;
|
||||
}
|
||||
|
||||
public void setJournal(PMJournal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
public List<PMSubject> getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
public void setSubjects(List<PMSubject> subjects) {
|
||||
this.subjects = subjects;
|
||||
}
|
||||
|
||||
public List<PMSubject> getPublicationTypes() {
|
||||
return publicationTypes;
|
||||
}
|
||||
|
||||
public void setPublicationTypes(List<PMSubject> publicationTypes) {
|
||||
this.publicationTypes = publicationTypes;
|
||||
}
|
||||
|
||||
public List<PMAuthor> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
public void setAuthors(List<PMAuthor> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
public List<PMGrant> getGrants() {
|
||||
return grants;
|
||||
}
|
||||
|
||||
public void setGrants(List<PMGrant> grants) {
|
||||
this.grants = grants;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,6 +12,8 @@ public class PMAuthor implements Serializable {
|
|||
|
||||
private String lastName;
|
||||
private String foreName;
|
||||
private PMIdentifier identifier;
|
||||
private PMAffiliation affiliation;
|
||||
|
||||
/**
|
||||
* Gets last name.
|
||||
|
@ -59,4 +61,40 @@ public class PMAuthor implements Serializable {
|
|||
.format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : "");
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets identifier.
|
||||
*
|
||||
* @return the identifier
|
||||
*/
|
||||
public PMIdentifier getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets identifier.
|
||||
*
|
||||
* @param identifier the identifier
|
||||
*/
|
||||
public void setIdentifier(PMIdentifier identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets affiliation.
|
||||
*
|
||||
* @return the affiliation
|
||||
*/
|
||||
public PMAffiliation getAffiliation() {
|
||||
return affiliation;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets affiliation.
|
||||
*
|
||||
* @param affiliation the affiliation
|
||||
*/
|
||||
public void setAffiliation(PMAffiliation affiliation) {
|
||||
this.affiliation = affiliation;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
/**
 * A persistent identifier attached to a PubMed author (for example an ORCID iD),
 * made of the identifier value and the name of its type.
 *
 * <p>The class is {@link java.io.Serializable} because it is held as a field of
 * {@code PMAuthor}, which is itself serializable.</p>
 *
 * <p>Whenever the value is set (constructor or {@link #setPid(String)}) it is
 * normalised by {@code cleanPid}: ORCID iDs written without dashes or as an
 * {@code orcid.org} URL are rewritten to the canonical {@code 0000-0000-0000-0000}
 * form; any other value is stored unchanged.</p>
 */
public class PMIdentifier implements java.io.Serializable {

	private static final long serialVersionUID = 1L;

	/** ORCID iD written as 16 contiguous characters, e.g. {@code 0000000163025705}; the last one may be the checksum {@code X}. */
	private static final java.util.regex.Pattern COMPACT_ORCID = java.util.regex.Pattern
		.compile("([0-9]{4})([0-9]{4})([0-9]{4})([0-9]{3}[0-9X])");

	/** ORCID iD written as a URL, e.g. {@code http://orcid.org/0000-0001-8567-3543}; both http and https are accepted. */
	private static final java.util.regex.Pattern ORCID_URL = java.util.regex.Pattern
		.compile("https?://orcid\\.org/([0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X])");

	private String pid;

	private String type;

	/**
	 * Creates an identifier, normalising the value if it is a recognisable ORCID iD.
	 *
	 * @param pid the identifier value, may be {@code null}
	 * @param type the identifier type
	 */
	public PMIdentifier(String pid, String type) {
		this.pid = cleanPid(pid);
		this.type = type;
	}

	/**
	 * Creates an empty identifier; both fields are left {@code null}.
	 */
	public PMIdentifier() {

	}

	/**
	 * Normalises ORCID iDs to the dashed form and leaves every other value untouched.
	 *
	 * @param pid the raw identifier value, may be {@code null}
	 * @return the normalised value, or {@code null} when the input is {@code null}
	 */
	private static String cleanPid(String pid) {

		if (pid == null) {
			return null;
		}

		// clean ORCID ID in the form 0000000163025705 to 0000-0001-6302-5705
		final java.util.regex.Matcher compact = COMPACT_ORCID.matcher(pid);
		if (compact.matches()) {
			return String.join("-", compact.group(1), compact.group(2), compact.group(3), compact.group(4));
		}

		// clean ORCID in the form http(s)://orcid.org/0000-0001-8567-3543 to 0000-0001-8567-3543
		// (the final character may be the checksum 'X', exactly as in the compact form)
		final java.util.regex.Matcher url = ORCID_URL.matcher(pid);
		if (url.matches()) {
			return url.group(1);
		}

		return pid;
	}

	/**
	 * @return the (normalised) identifier value
	 */
	public String getPid() {
		return pid;
	}

	/**
	 * Sets the identifier value, applying the same normalisation as the constructor.
	 *
	 * @param pid the identifier value, may be {@code null}
	 * @return this instance, to allow call chaining
	 */
	public PMIdentifier setPid(String pid) {
		this.pid = cleanPid(pid);
		return this;
	}

	/**
	 * @return the identifier type
	 */
	public String getType() {
		return type;
	}

	/**
	 * Sets the identifier type.
	 *
	 * @param type the identifier type
	 * @return this instance, to allow call chaining
	 */
	public PMIdentifier setType(String type) {
		this.type = type;
		return this;
	}
}
|
|
@ -28,13 +28,19 @@
|
|||
"paramLongName": "dataciteInputPath",
|
||||
"paramDescription": "the path to get the input data from Datacite",
|
||||
"paramRequired": true
|
||||
},{
|
||||
},
|
||||
{
|
||||
"paramName": "wip",
|
||||
"paramLongName": "webCrawlInputPath",
|
||||
"paramDescription": "the path to get the input data from Web Crawl",
|
||||
"paramRequired": true
|
||||
}
|
||||
,
|
||||
},
|
||||
{
|
||||
"paramName": "pub",
|
||||
"paramLongName": "publisherInputPath",
|
||||
"paramDescription": "the path to get the input data from publishers",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
|
|
|
@ -31,9 +31,11 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen
|
|||
# The following is needed as a property of a workflow
|
||||
oozie.wf.application.path=${oozieTopWfApplicationPath}
|
||||
|
||||
crossrefInputPath=/data/bip-affiliations/crossref-data.json
|
||||
pubmedInputPath=/data/bip-affiliations/pubmed-data.json
|
||||
openapcInputPath=/data/bip-affiliations/openapc-data.json
|
||||
dataciteInputPath=/data/bip-affiliations/datacite-data.json
|
||||
crossrefInputPath=/data/openaire-affiliations/crossref-data.json
|
||||
pubmedInputPath=/data/openaire-affiliations/pubmed-data-v4.json
|
||||
openapcInputPath=/data/openaire-affiliations/openapc-data.json
|
||||
dataciteInputPath=/data/openaire-affiliations/datacite-data.json
|
||||
webCrawlInputPath=/data/openaire-affiliations/webCrawl
|
||||
publisherInputPath=/data/openaire-affiliations/publishers
|
||||
|
||||
outputPath=/tmp/crossref-affiliations-output-v5
|
||||
outputPath=/tmp/affRoAS
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="BipAffiliations" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="OpenAIREAffiliations" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
|
||||
<property>
|
||||
|
@ -21,6 +21,10 @@
|
|||
<name>webCrawlInputPath</name>
|
||||
<description>the path where to find the inferred affiliation relations from webCrawl</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>publisherInputPath</name>
|
||||
<description>the path where to find the inferred affiliation relations from publisher websites</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path where to store the actionset</description>
|
||||
|
@ -99,7 +103,7 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the inferred by BIP! affiliation relations (from Crossref and Pubmed)</name>
|
||||
<name>Produces the atomic action with the inferred by OpenAIRE affiliation relations</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipaffiliations.PrepareAffiliationRelations</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -117,6 +121,7 @@
|
|||
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
|
||||
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
|
||||
<arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
|
||||
<arg>--publisherInputPath</arg><arg>${publisherInputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -16,5 +16,11 @@
|
|||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "bp",
|
||||
"paramLongName": "backupPath",
|
||||
"paramDescription": "the hdfs path to move the OC data after the extraction",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
|
|
@ -24,12 +24,13 @@
|
|||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the hdfs path where to store the extracted data",
|
||||
"paramRequired": true
|
||||
}, {
|
||||
"paramName": "nn",
|
||||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"paramName": "nn",
|
||||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -94,17 +94,7 @@
|
|||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/Original</arg>
|
||||
<arg>--outputPath</arg><arg>${inputPath}/Extracted</arg>
|
||||
</java>
|
||||
<ok to="read"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="extract_correspondence">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/correspondence</arg>
|
||||
<arg>--outputPath</arg><arg>${inputPath}/correspondence_extracted</arg>
|
||||
<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
|
||||
</java>
|
||||
<ok to="read"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -21,5 +21,30 @@
|
|||
"paramLongName": "workingDir",
|
||||
"paramDescription": "the path of the working directory",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "pu",
|
||||
"paramLongName": "postgresUrl",
|
||||
"paramDescription": "the JDBC url of the postgres database",
|
||||
"paramRequired": false
|
||||
},
|
||||
|
||||
{
|
||||
"paramName": "ps",
|
||||
"paramLongName": "postgresUser",
|
||||
"paramDescription": "the user to access the postgres database",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "pp",
|
||||
"paramLongName": "postgresPassword",
|
||||
"paramDescription": "the password to access the postgres database",
|
||||
"paramRequired": false
|
||||
},{
|
||||
"paramName": "nn",
|
||||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
]
|
||||
|
|
|
@ -1,2 +1,5 @@
|
|||
inputPath=/data/orcid_2023/tables/
|
||||
outputPath=/user/miriam.baglioni/peopleAS
|
||||
outputPath=/user/miriam.baglioni/peopleAS
|
||||
postgresUrl=jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus
|
||||
postgresUser=dnet
|
||||
postgresPassword=dnetPwd
|
|
@ -9,6 +9,18 @@
|
|||
<name>outputPath</name>
|
||||
<description>the path where to store the actionset</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUrl</name>
|
||||
<description>the JDBC url of the postgres database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the user to access the postgres database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the password to access the postgres database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -102,6 +114,10 @@
|
|||
<arg>--inputPath</arg><arg>${inputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--postgresUrl</arg><arg>${postgresUrl}</arg>
|
||||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the path of the input json",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,58 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorNumber</name>
|
||||
<value>4</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<value>15G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<value>6G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,53 @@
|
|||
<workflow-app name="Update_RAiD_action_set" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>raidJsonInputPath</name>
|
||||
<description>the path of the json</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>raidActionSetPath</name>
|
||||
<description>path where to store the action set</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="deleteoutputpath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="deleteoutputpath">
|
||||
<fs>
|
||||
<delete path='${raidActionSetPath}'/>
|
||||
<mkdir path='${raidActionSetPath}'/>
|
||||
</fs>
|
||||
<ok to="processRAiDFile"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="processRAiDFile">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ProcessRAiDFile</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${raidJsonInputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${raidActionSetPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -24,7 +24,7 @@
|
|||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
|
||||
<case to="reset_workingDir">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
|
||||
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
|
||||
</switch>
|
||||
</decision>
|
||||
|
@ -33,6 +33,14 @@
|
|||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="reset_workingDir">
|
||||
<fs>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="download"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="download">
|
||||
<shell xmlns="uri:oozie:shell-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"skipUpdate", "paramDescription": "skip update ", "paramRequired": false},
|
||||
{"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the working path ", "paramRequired": true}
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the baseline path", "paramRequired": true},
|
||||
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the mdstore path to save", "paramRequired": true}
|
||||
|
||||
]
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Download_Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>baselineWorkingPath</name>
|
||||
|
@ -16,11 +16,6 @@
|
|||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>skipUpdate</name>
|
||||
<value>false</value>
|
||||
<description>The request block size</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="StartTransaction"/>
|
||||
|
@ -44,16 +39,16 @@
|
|||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="ConvertDataset"/>
|
||||
<ok to="TransformPubMed"/>
|
||||
<error to="RollBack"/>
|
||||
</action>
|
||||
|
||||
<action name="ConvertDataset">
|
||||
<action name="TransformPubMed">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Baseline to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame</class>
|
||||
<name>Convert Baseline Pubmed to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
|
@ -65,12 +60,10 @@
|
|||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${baselineWorkingPath}</arg>
|
||||
<arg>--sourcePath</arg><arg>${baselineWorkingPath}</arg>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
|
||||
<arg>--skipUpdate</arg><arg>${skipUpdate}</arg>
|
||||
</spark>
|
||||
<ok to="CommitVersion"/>
|
||||
<error to="RollBack"/>
|
||||
|
|
|
@ -14,7 +14,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{
|
|||
PidType
|
||||
}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang.StringUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.apache.spark.sql.Row
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
|
@ -37,7 +37,7 @@ case class mappingAuthor(
|
|||
family: Option[String],
|
||||
sequence: Option[String],
|
||||
ORCID: Option[String],
|
||||
affiliation: Option[mappingAffiliation]
|
||||
affiliation: Option[List[mappingAffiliation]]
|
||||
) {}
|
||||
|
||||
case class funderInfo(id: String, uri: String, name: String, synonym: List[String]) {}
|
||||
|
@ -332,7 +332,7 @@ case object Crossref2Oaf {
|
|||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
//MAPPING Crossref DOI into PID
|
||||
val doi: String = DoiCleaningRule.normalizeDoi((json \ "DOI").extract[String])
|
||||
val doi: String = DoiCleaningRule.clean((json \ "DOI").extract[String])
|
||||
result.setPid(
|
||||
List(
|
||||
structuredProperty(
|
||||
|
@ -457,15 +457,14 @@ case object Crossref2Oaf {
|
|||
}
|
||||
|
||||
//Mapping Author
|
||||
val authorList: List[mappingAuthor] =
|
||||
(json \ "author").extract[List[mappingAuthor]].filter(a => a.family.isDefined)
|
||||
val authorList: List[mappingAuthor] = (json \ "author").extract[List[mappingAuthor]].filter(a => a.family.isDefined)
|
||||
|
||||
val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) =>
|
||||
a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")
|
||||
)
|
||||
|
||||
result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) =>
|
||||
generateAuhtor(a.given.orNull, a.family.get, a.ORCID.orNull, index)
|
||||
generateAuthor(a.given.orNull, a.family.get, a.ORCID.orNull, index, a.affiliation)
|
||||
}.asJava)
|
||||
|
||||
// Mapping instance
|
||||
|
@ -504,6 +503,11 @@ case object Crossref2Oaf {
|
|||
)
|
||||
}
|
||||
|
||||
if (doi.startsWith("10.3410") || doi.startsWith("10.12703"))
|
||||
instance.setHostedby(
|
||||
OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true), "H1Connect")
|
||||
)
|
||||
|
||||
instance.setAccessright(
|
||||
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
|
||||
)
|
||||
|
@ -556,12 +560,23 @@ case object Crossref2Oaf {
|
|||
s"50|doiboost____|$id"
|
||||
}
|
||||
|
||||
def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = {
|
||||
private def generateAuthor(
|
||||
given: String,
|
||||
family: String,
|
||||
orcid: String,
|
||||
index: Int,
|
||||
affiliation: Option[List[mappingAffiliation]]
|
||||
): Author = {
|
||||
val a = new Author
|
||||
a.setName(given)
|
||||
a.setSurname(family)
|
||||
a.setFullname(s"$given $family")
|
||||
a.setRank(index + 1)
|
||||
|
||||
// Adding Raw affiliation if it's defined
|
||||
if (affiliation.isDefined) {
|
||||
a.setRawAffiliationString(affiliation.get.map(a => a.name).asJava)
|
||||
}
|
||||
if (StringUtils.isNotBlank(orcid))
|
||||
a.setPid(
|
||||
List(
|
||||
|
@ -655,11 +670,11 @@ case object Crossref2Oaf {
|
|||
val doi = input.getString(0)
|
||||
val rorId = input.getString(1)
|
||||
|
||||
val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.normalizeDoi(doi)}"
|
||||
val pubId = IdentifierFactory.idFromPid("50", "doi", DoiCleaningRule.clean(doi), true)
|
||||
val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
|
||||
|
||||
val r: Relation = new Relation
|
||||
DoiCleaningRule.clean(doi)
|
||||
|
||||
r.setSource(pubId)
|
||||
r.setTarget(affId)
|
||||
r.setRelType(ModelConstants.RESULT_ORGANIZATION)
|
||||
|
@ -687,7 +702,15 @@ case object Crossref2Oaf {
|
|||
val objectType = (json \ "type").extractOrElse[String](null)
|
||||
if (objectType == null)
|
||||
return resultList
|
||||
val typology = getTypeQualifier(objectType, vocabularies)
|
||||
|
||||
// If the item has a relations is-review-of, then we force it to a peer-review
|
||||
val is_review = json \ "relation" \ "is-review-of" \ "id"
|
||||
var force_to_review = false
|
||||
if (is_review != JNothing) {
|
||||
force_to_review = true
|
||||
}
|
||||
|
||||
val typology = getTypeQualifier(if (force_to_review) "peer-review" else objectType, vocabularies)
|
||||
|
||||
if (typology == null)
|
||||
return List()
|
||||
|
@ -739,33 +762,6 @@ case object Crossref2Oaf {
|
|||
else
|
||||
resultList
|
||||
}
|
||||
|
||||
// if (uw != null) {
|
||||
// result.getCollectedfrom.add(createUnpayWallCollectedFrom())
|
||||
// val i: Instance = new Instance()
|
||||
// i.setCollectedfrom(createUnpayWallCollectedFrom())
|
||||
// if (uw.best_oa_location != null) {
|
||||
//
|
||||
// i.setUrl(List(uw.best_oa_location.url).asJava)
|
||||
// if (uw.best_oa_location.license.isDefined) {
|
||||
// i.setLicense(field[String](uw.best_oa_location.license.get, null))
|
||||
// }
|
||||
//
|
||||
// val colour = get_unpaywall_color(uw.oa_status)
|
||||
// if (colour.isDefined) {
|
||||
// val a = new AccessRight
|
||||
// a.setClassid(ModelConstants.ACCESS_RIGHT_OPEN)
|
||||
// a.setClassname(ModelConstants.ACCESS_RIGHT_OPEN)
|
||||
// a.setSchemeid(ModelConstants.DNET_ACCESS_MODES)
|
||||
// a.setSchemename(ModelConstants.DNET_ACCESS_MODES)
|
||||
// a.setOpenAccessRoute(colour.get)
|
||||
// i.setAccessright(a)
|
||||
// }
|
||||
// i.setPid(result.getPid)
|
||||
// result.getInstance().add(i)
|
||||
// }
|
||||
// }
|
||||
|
||||
}
|
||||
|
||||
private def createCiteRelation(source: Result, targetPid: String, targetPidType: String): List[Relation] = {
|
||||
|
@ -960,7 +956,26 @@ case object Crossref2Oaf {
|
|||
case "10.13039/501100010790" =>
|
||||
generateSimpleRelationFromAward(funder, "erasmusplus_", a => a)
|
||||
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||
|
||||
//Add for Danish funders
|
||||
//Independent Research Fund Denmark (IRFD)
|
||||
case "10.13039/501100004836" =>
|
||||
generateSimpleRelationFromAward(funder, "irfd________", a => a)
|
||||
val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
//Carlsberg Foundation (CF)
|
||||
case "10.13039/501100002808" =>
|
||||
generateSimpleRelationFromAward(funder, "cf__________", a => a)
|
||||
val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
//Novo Nordisk Foundation (NNF)
|
||||
case "10.13039/501100009708" =>
|
||||
generateSimpleRelationFromAward(funder, "nnf___________", a => a)
|
||||
val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63")
|
||||
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
|
||||
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
|
||||
case _ => logger.debug("no match for " + funder.DOI.get)
|
||||
}
|
||||
|
||||
} else {
|
||||
|
|
|
@ -407,10 +407,9 @@ object DataciteToOAFTransformation {
|
|||
)
|
||||
}
|
||||
if (c.affiliation.isDefined)
|
||||
a.setAffiliation(
|
||||
a.setRawAffiliationString(
|
||||
c.affiliation.get
|
||||
.filter(af => af.nonEmpty)
|
||||
.map(af => OafMapperUtils.field(af, dataInfo))
|
||||
.asJava
|
||||
)
|
||||
a.setRank(idx + 1)
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
package eu.dnetlib.dhp.sx.bio.ebi
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import eu.dnetlib.dhp.common.Constants
|
||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser2, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.transformation.TransformSparkJobNode
|
||||
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
/** Spark job that turns a PubMed XML baseline into an MDStore dump.
  *
  * The job reads the raw XML found under `sourcePath`, parses every `PubmedArticle`
  * record with [[PMParser2]], converts it through `PubMedToOaf.convert` and writes
  * the resulting JSON lines (gzip compressed) under the HDFS path of the MDStore
  * version received as the `mdstoreOutputVersion` argument.
  *
  * @param propertyPath classpath location of the JSON file describing the job arguments
  * @param args the command line arguments
  * @param log the logger used by the job
  */
class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  /** Entry point of the Spark node: reads the job arguments, loads the vocabularies
    * from the IS lookup service and delegates to [[createPubmedDump]].
    */
  override def run(): Unit = {
    val isLookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", isLookupUrl)
    val sourcePath = parser.get("sourcePath")
    log.info(s"SourcePath is '$sourcePath'")
    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
    log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")

    // The MDStore version is passed as a JSON document; only its HDFS path is used here.
    val mapper = new ObjectMapper()
    val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
    log.info(s"outputBasePath is '$outputBasePath'")

    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)

    createPubmedDump(spark, sourcePath, outputBasePath, vocabularies)
  }

  /** Creates a dump of the PubMed articles.
    *
    * @param spark the spark session, must not be null
    * @param sourcePath the path of the source XML
    * @param targetPath the base path of the target MDStore; data is written under
    *                   `targetPath + MDSTORE_DATA_PATH` and the number of written
    *                   records under `targetPath + MDSTORE_SIZE_PATH`
    * @param vocabularies the vocabularies handed to `PubMedToOaf.convert`
    * @throws RuntimeException when an article cannot be parsed; the original
    *                          parsing exception is attached as the cause
    */
  def createPubmedDump(
    spark: SparkSession,
    sourcePath: String,
    targetPath: String,
    vocabularies: VocabularyGroup
  ): Unit = {
    require(spark != null)

    implicit val PMEncoder: Encoder[PMArticle] = Encoders.bean(classOf[PMArticle])

    import spark.implicits._

    // Using the closing tag as line separator yields one text row per article.
    val df = spark.read.option("lineSep", "</PubmedArticle>").text(sourcePath)
    val mapper = new ObjectMapper()
    df.as[String]
      .map(s => {
        // Drop whatever precedes the opening tag and restore the closing tag
        // that was consumed as line separator.
        val id = s.indexOf("<PubmedArticle>")
        if (id >= 0) s"${s.substring(id)}</PubmedArticle>" else null
      })
      .filter(s => s != null)
      .map { i =>
        try {
          new PMParser2().parse(i)
        } catch {
          // Keep the original exception as cause, otherwise the real parsing
          // error and its stack trace are lost.
          case e: Exception =>
            throw new RuntimeException(s"Error parsing article: $i", e)
        }
      }
      .dropDuplicates("pmid")
      .map { a =>
        val oaf = PubMedToOaf.convert(a, vocabularies)
        if (oaf != null)
          mapper.writeValueAsString(oaf)
        else
          null
      }
      .as[String]
      .filter(s => s != null)
      .write
      .option("compression", "gzip")
      .mode("overwrite")
      .text(targetPath + MDSTORE_DATA_PATH)

    // Re-read what has been written to store the size of the MDStore next to the data.
    val mdStoreSize = spark.read.text(targetPath + MDSTORE_DATA_PATH).count
    writeHdfsFile(spark.sparkContext.hadoopConfiguration, "" + mdStoreSize, targetPath + MDSTORE_SIZE_PATH)
  }
}
|
||||
|
||||
object SparkCreatePubmedDump {

  /** Command line entry point: builds the job with its argument description
    * file, initializes it and runs it.
    *
    * @param args the command line arguments forwarded to the job
    */
  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(getClass)
    val job =
      new SparkCreatePubmedDump("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json", args, logger)
    job.initialize().run()
  }
}
|
|
@ -0,0 +1,277 @@
|
|||
package eu.dnetlib.dhp.sx.bio.pubmed
|
||||
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
|
||||
import javax.xml.stream.XMLEventReader
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.xml.{MetaData, NodeSeq}
|
||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
|
||||
|
||||
/** Parser turning the XML of a single `PubmedArticle` record into a [[PMArticle]].
  *
  * Two entry points are available: [[parse]], which loads the whole record with
  * `scala.xml.XML`, and [[parse2]], which consumes a stream of XML events.
  */
class PMParser2 {

  /** Extracts the value of an attribute from a MetaData object.
    *
    * @param attrs the MetaData object
    * @param key the key of the attribute
    * @return the text of the first node of the attribute, or null if the attribute
    *         is not found or has no value
    */
  private def extractAttributes(attrs: MetaData, key: String): String =
    attrs
      .get(key)
      .filter(nodes => nodes != null && nodes.nonEmpty)
      .map(nodes => nodes.head.text)
      .orNull

  /** Validates and formats a date given the year, month, and day as strings.
    *
    * The three parts must be integers and must together denote an existing
    * calendar date (for example month 13 or the 31st of February are rejected).
    *
    * @param year the year as a string
    * @param month the month as a string
    * @param day the day as a string
    * @return the formatted date as "YYYY-MM-DD" (month and day zero padded, year
    *         printed as is) or null if the date is invalid
    */
  private def validate_Date(year: String, month: String, day: String): String = {
    try {
      val y = year.toInt
      val m = month.toInt
      val d = day.toInt
      // LocalDate.of throws a DateTimeException when the combination is not a real date.
      java.time.LocalDate.of(y, m, d)
      f"$y-$m%02d-$d%02d"
    } catch {
      // Only non fatal errors (bad numbers, null parts, impossible dates) mean
      // "invalid date"; fatal JVM errors must keep propagating.
      case scala.util.control.NonFatal(_) => null
    }
  }

  /** Extracts the grant information from a NodeSeq object.
    *
    * @param gNode the `Grant` nodes
    * @return one PMGrant per node, or an empty list if there are no nodes
    */
  private def extractGrant(gNode: NodeSeq): List[PMGrant] = {
    gNode
      .map(node => {
        val grantId = (node \ "GrantID").text
        val agency = (node \ "Agency").text
        val country = (node \ "Country").text
        new PMGrant(grantId, agency, country)
      })
      .toList
  }

  /** Extracts the journal information from a NodeSeq object.
    *
    * @param jNode the `Journal` node
    * @return the journal information or null if the journal has no title
    */
  private def extractJournal(jNode: NodeSeq): PMJournal = {
    val journal = new PMJournal
    journal.setTitle((jNode \ "Title").text)
    journal.setIssn((jNode \ "ISSN").text)
    journal.setVolume((jNode \ "JournalIssue" \ "Volume").text)
    journal.setIssue((jNode \ "JournalIssue" \ "Issue").text)
    // isNotEmpty is null safe, so no separate null check is needed.
    if (StringUtils.isNotEmpty(journal.getTitle))
      journal
    else
      null
  }

  /** Extracts the authors from a NodeSeq object.
    *
    * For every author the last name and fore name are always set; the identifier
    * is set only when both its value and its `Source` attribute are present; the
    * affiliation is set only when its name is present.
    *
    * @param aNode the `Author` nodes
    * @return one PMAuthor per node, or an empty list if there are no nodes
    */
  private def extractAuthors(aNode: NodeSeq): List[PMAuthor] = {
    aNode
      .map(author => {
        val a = new PMAuthor
        a.setLastName((author \ "LastName").text)
        a.setForeName((author \ "ForeName").text)
        val id = (author \ "Identifier").text
        val idType = (author \ "Identifier" \ "@Source").text

        if (StringUtils.isNotEmpty(id) && StringUtils.isNotEmpty(idType)) {
          a.setIdentifier(new PMIdentifier(id, idType))
        }

        val affiliation = (author \ "AffiliationInfo" \ "Affiliation").text
        val affiliationId = (author \ "AffiliationInfo" \ "Identifier").text
        val affiliationIdType = (author \ "AffiliationInfo" \ "Identifier" \ "@Source").text

        if (StringUtils.isNotEmpty(affiliation)) {
          val aff = new PMAffiliation()
          aff.setName(affiliation)
          if (StringUtils.isNotEmpty(affiliationId) && StringUtils.isNotEmpty(affiliationIdType)) {
            aff.setIdentifier(new PMIdentifier(affiliationId, affiliationIdType))
          }
          a.setAffiliation(aff)
        }
        a
      })
      .toList
  }

  /** Parses the XML of one `PubmedArticle` record.
    *
    * @param input the XML of the record, as a string
    * @return the article populated with grants, journal, authors, identifiers
    *         (pmid, doi, pmc), date, title, description, language, subjects and
    *         publication types
    */
  def parse(input: String): PMArticle = {
    val xml = scala.xml.XML.loadString(input)
    val article = new PMArticle

    val grantNodes = xml \ "MedlineCitation" \\ "Grant"
    article.setGrants(extractGrant(grantNodes).asJava)

    val journal = xml \ "MedlineCitation" \ "Article" \ "Journal"
    article.setJournal(extractJournal(journal))

    val authors = xml \ "MedlineCitation" \ "Article" \ "AuthorList" \ "Author"
    article.setAuthors(extractAuthors(authors).asJava)

    val pmId = xml \ "MedlineCitation" \ "PMID"

    val articleIds = xml \ "PubmedData" \ "ArticleIdList" \ "ArticleId"
    articleIds.foreach(articleId => {
      val idType = (articleId \ "@IdType").text
      val id = articleId.text
      if ("doi".equalsIgnoreCase(idType)) article.setDoi(id)
      if ("pmc".equalsIgnoreCase(idType)) article.setPmcId(id)
    })
    article.setPmid(pmId.text)

    // The article date is taken from DateCompleted and set only when it is a valid date.
    val pubMedPubDate = xml \ "MedlineCitation" \ "DateCompleted"
    val currentDate =
      validate_Date((pubMedPubDate \ "Year").text, (pubMedPubDate \ "Month").text, (pubMedPubDate \ "Day").text)
    if (currentDate != null) article.setDate(currentDate)

    val articleTitle = xml \ "MedlineCitation" \ "Article" \ "ArticleTitle"
    article.setTitle(articleTitle.text)

    // The abstract is collapsed on a single line, trimming every original line.
    val abstractText = (xml \ "MedlineCitation" \ "Article" \ "Abstract" \ "AbstractText").text
    if (abstractText != null && abstractText.nonEmpty)
      article.setDescription(abstractText.split("\n").map(s => s.trim).mkString(" ").trim)

    val language = xml \ "MedlineCitation" \ "Article" \ "Language"
    article.setLanguage(language.text)

    // Only the first 20 MeSH headings are kept.
    val subjects = xml \ "MedlineCitation" \ "MeshHeadingList" \ "MeshHeading"
    article.setSubjects(
      subjects
        .take(20)
        .map(subject => {
          val descriptorName = (subject \ "DescriptorName").text
          val ui = (subject \ "DescriptorName" \ "@UI").text
          val s = new PMSubject
          s.setValue(descriptorName)
          s.setMeshId(ui)
          s
        })
        .toList
        .asJava
    )

    val publicationTypes = xml \ "MedlineCitation" \ "Article" \ "PublicationTypeList" \ "PublicationType"
    article.setPublicationTypes(
      publicationTypes
        .map(pt => {
          val s = new PMSubject
          s.setValue(pt.text)
          s
        })
        .toList
        .asJava
    )

    article
  }

  /** Event based parser: consumes events until the end of the first
    * `PubmedArticle` element and returns the article built so far.
    *
    * NOTE(review): the parameter is a `javax.xml.stream.XMLEventReader` while the
    * events matched below are the `scala.xml.pull` case classes; it looks like the
    * events produced by a javax reader can never match them, in which case this
    * method always returns null. To be confirmed against the callers.
    *
    * @param xml the reader providing the XML events
    * @return the parsed article, or null if the reader is exhausted before the end
    *         of a `PubmedArticle` element
    */
  def parse2(xml: XMLEventReader): PMArticle = {
    var currentArticle: PMArticle = null
    var currentSubject: PMSubject = null
    var currentAuthor: PMAuthor = null
    var currentJournal: PMJournal = null
    var currentGrant: PMGrant = null
    var currNode: String = null
    var currentYear = "0"
    var currentMonth = "01"
    var currentDay = "01"
    var currentArticleType: String = null

    while (xml.hasNext) {
      val ne = xml.next
      ne match {
        // Element start: remember the current node and allocate the object it describes.
        case EvElemStart(_, label, attrs, _) =>
          currNode = label

          label match {
            case "PubmedArticle" => currentArticle = new PMArticle
            case "Author"        => currentAuthor = new PMAuthor
            case "Journal"       => currentJournal = new PMJournal
            case "Grant"         => currentGrant = new PMGrant
            case "PublicationType" | "DescriptorName" =>
              currentSubject = new PMSubject
              currentSubject.setMeshId(extractAttributes(attrs, "UI"))
            case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
            case _           =>
          }
        // Element end: attach the completed object to the article.
        case EvElemEnd(_, label) =>
          label match {
            case "PubmedArticle" => return currentArticle
            case "Author"        => currentArticle.getAuthors.add(currentAuthor)
            case "Journal"       => currentArticle.setJournal(currentJournal)
            case "Grant"         => currentArticle.getGrants.add(currentGrant)
            case "PubMedPubDate" =>
              // Only the first PubMedPubDate that is met sets the article date.
              if (currentArticle.getDate == null)
                currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
            case "PubDate"         => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
            case "DescriptorName"  => currentArticle.getSubjects.add(currentSubject)
            case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
            case _                 =>
          }
        // Text: route the trimmed, non blank text to the field of the current node.
        case EvText(text) =>
          if (currNode != null && text.trim.nonEmpty)
            currNode match {
              case "ArticleTitle" => {
                // Text may arrive in several chunks: append to what is already there.
                if (currentArticle.getTitle == null)
                  currentArticle.setTitle(text.trim)
                else
                  currentArticle.setTitle(currentArticle.getTitle + text.trim)
              }
              case "AbstractText" => {
                if (currentArticle.getDescription == null)
                  currentArticle.setDescription(text.trim)
                else
                  currentArticle.setDescription(currentArticle.getDescription + text.trim)
              }
              case "PMID" => currentArticle.setPmid(text.trim)
              case "ArticleId" =>
                if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
                if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(text.trim)
              case "Language"                           => currentArticle.setLanguage(text.trim)
              case "ISSN"                               => currentJournal.setIssn(text.trim)
              case "GrantID"                            => currentGrant.setGrantID(text.trim)
              case "Agency"                             => currentGrant.setAgency(text.trim)
              case "Country"                            => if (currentGrant != null) currentGrant.setCountry(text.trim)
              case "Year"                               => currentYear = text.trim
              case "Month"                              => currentMonth = text.trim
              case "Day"                                => currentDay = text.trim
              case "Volume"                             => currentJournal.setVolume(text.trim)
              case "Issue"                              => currentJournal.setIssue(text.trim)
              case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
              case "LastName" => {
                if (currentAuthor != null)
                  currentAuthor.setLastName(text.trim)
              }
              case "ForeName" =>
                if (currentAuthor != null)
                  currentAuthor.setForeName(text.trim)
              case "Title" =>
                if (currentJournal.getTitle == null)
                  currentJournal.setTitle(text.trim)
                else
                  currentJournal.setTitle(currentJournal.getTitle + text.trim)
              case _ =>

            }
        case _ =>
      }

    }
    null
  }

}
|
|
@ -294,6 +294,24 @@ object PubMedToOaf {
|
|||
author.setName(a.getForeName)
|
||||
author.setSurname(a.getLastName)
|
||||
author.setFullname(a.getFullName)
|
||||
if (a.getIdentifier != null) {
|
||||
author.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
a.getIdentifier.getPid,
|
||||
OafMapperUtils.qualifier(
|
||||
a.getIdentifier.getType,
|
||||
a.getIdentifier.getType,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
),
|
||||
dataInfo
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
}
|
||||
if (a.getAffiliation != null)
|
||||
author.setRawAffiliationString(List(a.getAffiliation.getName).asJava)
|
||||
author.setRank(index + 1)
|
||||
author
|
||||
}(collection.breakOut)
|
||||
|
|
|
@ -28,8 +28,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
|
||||
public class PrepareAffiliationRelationsTest {
|
||||
|
||||
|
@ -39,8 +39,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
private static Path workingDir;
|
||||
private static final String ID_PREFIX = "50|doi_________::";
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(PrepareAffiliationRelationsTest.class);
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelationsTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
|
@ -74,21 +73,34 @@ public class PrepareAffiliationRelationsTest {
|
|||
@Test
|
||||
void testMatch() throws Exception {
|
||||
|
||||
String crossrefAffiliationRelationPath = getClass()
|
||||
String crossrefAffiliationRelationPathNew = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
|
||||
.getPath();
|
||||
|
||||
String crossrefAffiliationRelationPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror_old.json")
|
||||
.getPath();
|
||||
|
||||
String publisherAffiliationRelationPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
|
||||
.getPath();
|
||||
|
||||
String publisherAffiliationRelationOldPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publichers_old")
|
||||
.getPath();
|
||||
|
||||
String outputPath = workingDir.toString() + "/actionSet";
|
||||
|
||||
PrepareAffiliationRelations
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-crossrefInputPath", crossrefAffiliationRelationPath,
|
||||
"-crossrefInputPath", crossrefAffiliationRelationPathNew,
|
||||
"-pubmedInputPath", crossrefAffiliationRelationPath,
|
||||
"-openapcInputPath", crossrefAffiliationRelationPath,
|
||||
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
||||
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
||||
"-openapcInputPath", crossrefAffiliationRelationPathNew,
|
||||
"-dataciteInputPath", crossrefAffiliationRelationPathNew,
|
||||
"-webCrawlInputPath", crossrefAffiliationRelationPathNew,
|
||||
"-publisherInputPath", publisherAffiliationRelationPath,
|
||||
"-outputPath", outputPath
|
||||
});
|
||||
|
||||
|
@ -99,13 +111,8 @@ public class PrepareAffiliationRelationsTest {
|
|||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
// for (Relation r : tmp.collect()) {
|
||||
// System.out.println(
|
||||
// r.getSource() + "\t" + r.getTarget() + "\t" + r.getRelType() + "\t" + r.getRelClass() + "\t" + r.getSubRelType() + "\t" + r.getValidationDate() + "\t" + r.getDataInfo().getTrust() + "\t" + r.getDataInfo().getInferred()
|
||||
// );
|
||||
// }
|
||||
// count the number of relations
|
||||
assertEquals(120, tmp.count());
|
||||
assertEquals(162, tmp.count());// 18 + 24 + 30 * 4 = 162
|
||||
|
||||
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||
dataset.createOrReplaceTempView("result");
|
||||
|
@ -116,7 +123,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
// verify that we have equal number of bi-directional relations
|
||||
Assertions
|
||||
.assertEquals(
|
||||
60, execVerification
|
||||
81, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
||||
.collectAsList()
|
||||
|
@ -124,26 +131,56 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
60, execVerification
|
||||
81, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
||||
.collectAsList()
|
||||
.size());
|
||||
|
||||
// check confidence value of a specific relation
|
||||
String sourceDOI = "10.1061/(asce)0733-9399(2002)128:7(759)";
|
||||
String sourceDOI = "10.1089/10872910260066679";
|
||||
|
||||
final String sourceOpenaireId = ID_PREFIX
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", sourceDOI));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0.7071067812", execVerification
|
||||
"1.0", execVerification
|
||||
.filter(
|
||||
"source='" + sourceOpenaireId + "'")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(4));
|
||||
|
||||
final String publisherid = ID_PREFIX
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1089/10872910260066679"));
|
||||
final String rorId = "20|ror_________::" + IdentifierFactory.md5("https://ror.org/05cf8a891");
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
4, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, execVerification
|
||||
.filter(
|
||||
"source = '" + ID_PREFIX
|
||||
+ IdentifierFactory
|
||||
.md5(PidCleaner.normalizePidValue("doi", "10.1007/s00217-010-1268-9"))
|
||||
+ "' and target = '" + "20|ror_________::"
|
||||
+ IdentifierFactory.md5("https://ror.org/03265fv13") + "'")
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, execVerification
|
||||
.filter(
|
||||
"source = '" + ID_PREFIX
|
||||
+ IdentifierFactory
|
||||
.md5(PidCleaner.normalizePidValue("doi", "10.1007/3-540-47984-8_14"))
|
||||
+ "' and target = '" + "20|ror_________::"
|
||||
+ IdentifierFactory.md5("https://ror.org/00a0n9e72") + "'")
|
||||
.count());
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,6 +31,7 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
|
|||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
|
||||
public class CreateOpenCitationsASTest {
|
||||
|
||||
|
@ -280,17 +281,17 @@ public class CreateOpenCitationsASTest {
|
|||
@Test
|
||||
void testRelationsSourceTargetCouple() throws Exception {
|
||||
final String doi1 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
||||
final String doi2 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
||||
final String doi3 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
||||
final String doi4 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
||||
final String doi5 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
||||
final String doi6 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
||||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
|
|
|
@ -0,0 +1,165 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.raid;
|
||||
|
||||
import static java.nio.file.Files.createTempDirectory;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.OBJECT_MAPPER;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.rdd.RDD;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.opencitations.CreateOpenCitationsASTest;
|
||||
import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateRAiDActionSetJobTest {
|
||||
private static String input_path;
|
||||
private static String output_path;
|
||||
static SparkSession spark;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
|
||||
input_path = Paths
|
||||
.get(
|
||||
GenerateRAiDActionSetJobTest.class
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/raid/raid_example.json")
|
||||
.toURI())
|
||||
.toFile()
|
||||
.getAbsolutePath();
|
||||
|
||||
output_path = createTempDirectory(GenerateRAiDActionSetJobTest.class.getSimpleName() + "-")
|
||||
.toAbsolutePath()
|
||||
.toString();
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(GenerateRAiDActionSetJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", output_path);
|
||||
conf.set("hive.metastore.warehouse.dir", output_path);
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(GenerateRAiDActionSetJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
static void cleanUp() throws Exception {
|
||||
FileUtils.deleteDirectory(new File(output_path));
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
void testProcessRAiDEntities() {
|
||||
GenerateRAiDActionSetJob.processRAiDEntities(spark, input_path, output_path + "/test_raid_action_set");
|
||||
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<? extends Oaf> result = sc
|
||||
.sequenceFile(output_path + "/test_raid_action_set", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(AtomicAction::getPayload);
|
||||
|
||||
assertEquals(80, result.count());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPrepareRAiD() {
|
||||
|
||||
List<AtomicAction<? extends Oaf>> atomicActions = GenerateRAiDActionSetJob
|
||||
.prepareRAiD(
|
||||
new RAiDEntity(
|
||||
"-92190526",
|
||||
Arrays
|
||||
.asList(
|
||||
"Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura",
|
||||
"Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume",
|
||||
"Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont",
|
||||
"Maïeul GRUGET", "Cécile Duchêne"),
|
||||
"2021-09-10",
|
||||
"2024-02-16",
|
||||
Arrays
|
||||
.asList(
|
||||
"cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps",
|
||||
"pan-scalar map", "Python library", "QGIS", "map design", "landmarks",
|
||||
"Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]",
|
||||
"[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography",
|
||||
"eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency",
|
||||
"General Medicine", "Geography, Planning and Development", "multi-scales",
|
||||
"pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences",
|
||||
"progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design",
|
||||
"cartography, map generalisation, zoom, multi-scale map", "Interactive maps",
|
||||
"Map generalisation", "Earth and Planetary Sciences (miscellaneous)",
|
||||
"Cartographic generalization", "rivers", "Benchmark", "General Environmental Science",
|
||||
"open source", "drawing", "Constraint", "Multi-scale maps"),
|
||||
Arrays
|
||||
.asList(
|
||||
"Where do people look at during multi-scale map tasks?", "FogDetector survey raw data",
|
||||
"Collection of cartographic disorientation stories", "Anchorwhat dataset",
|
||||
"BasqueRoads: A Benchmark for Road Network Selection",
|
||||
"Progressive river network selection for pan-scalar maps",
|
||||
"BasqueRoads, a dataset to benchmark road selection algorithms",
|
||||
"Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps",
|
||||
"Empirical approach to advance the generalisation of multi-scale maps",
|
||||
"L'Alpe d'Huez: a dataset to benchmark topographic map generalisation",
|
||||
"eye-tracking data from a survey on zooming in a pan-scalar map",
|
||||
"Material of the experiment 'More is Less' from the MapMuxing project",
|
||||
"Cartagen4py, an open source Python library for map generalisation",
|
||||
"L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"),
|
||||
Arrays
|
||||
.asList(
|
||||
"50|doi_dedup___::6915135e0aa39f913394513f809ae58a",
|
||||
"50|doi_dedup___::754e3c283639bc6e104c925ff3e34007",
|
||||
"50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0",
|
||||
"50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a",
|
||||
"50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153",
|
||||
"50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a",
|
||||
"50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13",
|
||||
"50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4",
|
||||
"50|doi_dedup___::a9bc4453273b2d02648a5cb453195042",
|
||||
"50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7",
|
||||
"50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5",
|
||||
"50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283",
|
||||
"50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea",
|
||||
"50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"),
|
||||
"Exploring Multi-Scale Map Generalization and Design",
|
||||
"This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval."));
|
||||
|
||||
OtherResearchProduct orp = (OtherResearchProduct) atomicActions.get(0).getPayload();
|
||||
Relation rel = (Relation) atomicActions.get(1).getPayload();
|
||||
|
||||
assertEquals("Exploring Multi-Scale Map Generalization and Design", orp.getTitle().get(0).getValue());
|
||||
assertEquals("50|raid________::759a564ce5cc7360cab030c517c7366b", rel.getSource());
|
||||
assertEquals("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", rel.getTarget());
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -28,6 +28,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
|
||||
/**
|
||||
* @author miriam.baglioni
|
||||
|
@ -270,17 +271,17 @@ public class CreateTAActionSetTest {
|
|||
@Test
|
||||
void testRelationsSourceTargetCouple() throws Exception {
|
||||
final String doi1 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
|
||||
final String doi2 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
|
||||
final String doi3 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
|
||||
final String doi4 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
|
||||
final String doi5 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
|
||||
final String doi6 = "50|doi_________::"
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
|
||||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
package eu.dnetlib.dhp.collection.plugin.csv;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
@Disabled
|
||||
class FileCsvCollectorPluginTest {
|
||||
|
||||
private FileCsvCollectorPlugin plugin;
|
||||
|
||||
@Test
|
||||
void testCollect() throws CollectorException, DocumentException, IOException {
|
||||
|
||||
this.plugin = new FileCsvCollectorPlugin(FileSystem.getLocal(new Configuration()));
|
||||
|
||||
final ApiDescriptor api = new ApiDescriptor();
|
||||
api.setBaseUrl("file:///tmp/test.csv");
|
||||
api.setProtocol(CollectorPlugin.NAME.fileCSV.name());
|
||||
api.getParams().put("header", "true");
|
||||
api.getParams().put("identifier", "0");
|
||||
api.getParams().put("separator", ",");
|
||||
api.getParams().put("quote", "\"");
|
||||
|
||||
final List<String> list = this.plugin.collect(api, null).toList();
|
||||
assertTrue(list.size() > 0);
|
||||
|
||||
for (final String xml : list) {
|
||||
System.out.println(xml);
|
||||
assertTrue(StringUtils.isNotBlank(xml));
|
||||
assertTrue(StringUtils.isNotBlank((DocumentHelper.parseText(xml).valueOf("//*[@isId='true']"))));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
package eu.dnetlib.dhp.collection.plugin.csv;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@Disabled
|
||||
class HttpCsvCollectorPluginTest {
|
||||
|
||||
@Test
|
||||
void testCollect() {
|
||||
fail("Not yet implemented");
|
||||
}
|
||||
|
||||
}
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
|||
|
||||
class Gtr2PublicationsIteratorTest {
|
||||
|
||||
private static final String baseURL = "https://gtr.ukri.org/gtr/api";
|
||||
private static final String baseURL = "https://gtr.ukri.org/api";
|
||||
|
||||
private static final HttpClientParams clientParams = new HttpClientParams();
|
||||
|
||||
|
@ -34,7 +34,7 @@ class Gtr2PublicationsIteratorTest {
|
|||
@Test
|
||||
@Disabled
|
||||
public void testPaging() throws Exception {
|
||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, "2", "2", clientParams);
|
||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, "2", "3", clientParams);
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
Thread.sleep(300);
|
||||
|
@ -47,9 +47,9 @@ class Gtr2PublicationsIteratorTest {
|
|||
@Test
|
||||
@Disabled
|
||||
public void testOnePage() throws Exception {
|
||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, "12", "12", clientParams);
|
||||
final Iterator<String> iterator = new Gtr2PublicationsIterator(baseURL, null, "379", "380", clientParams);
|
||||
final int count = iterateAndCount(iterator);
|
||||
assertEquals(20, count);
|
||||
assertEquals(50, count);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -50,9 +50,10 @@ public class OsfPreprintsCollectorPluginTest {
|
|||
@Test
|
||||
@Disabled
|
||||
void test_one() throws CollectorException {
|
||||
this.plugin.collect(this.api, new AggregatorReport())
|
||||
.limit(1)
|
||||
.forEach(log::info);
|
||||
this.plugin
|
||||
.collect(this.api, new AggregatorReport())
|
||||
.limit(1)
|
||||
.forEach(log::info);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -95,7 +96,8 @@ public class OsfPreprintsCollectorPluginTest {
|
|||
final HttpConnector2 connector = new HttpConnector2();
|
||||
|
||||
try {
|
||||
final String res = connector.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json");
|
||||
final String res = connector
|
||||
.getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json");
|
||||
System.out.println(res);
|
||||
fail();
|
||||
} catch (final Throwable e) {
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.zenodo;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
public class ZenodoPluginCollectionTest {
|
||||
|
||||
@Test
|
||||
public void testZenodoIterator() throws Exception {
|
||||
|
||||
final GZIPInputStream gis = new GZIPInputStream(
|
||||
getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/zenodo/zenodo.tar.gz"));
|
||||
try (ZenodoTarIterator it = new ZenodoTarIterator(gis)) {
|
||||
Assertions.assertTrue(it.hasNext());
|
||||
int i = 0;
|
||||
while (it.hasNext()) {
|
||||
Assertions.assertNotNull(it.next());
|
||||
i++;
|
||||
}
|
||||
Assertions.assertEquals(10, i);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,9 +1,10 @@
|
|||
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
|
||||
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
|
||||
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
|
||||
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
||||
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
|
||||
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
|
||||
{"DOI":"10.1021\/ac020069k","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/01f5ytq51","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1161\/01.cir.0000013846.72805.7e","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02pttbw34","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1161\/hy02t2.102992","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/00qqv6244","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/00p991c53","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1126\/science.1073633","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03xez1567","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/006w34k90","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1089\/10872910260066679","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/05cf8a891","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1108\/02656719610116117","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03mnm0t94","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/007tn5k56","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1080\/01443610050111986","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/001x4vz59","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/01tmqtf75","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1021\/cm020118+","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02cf1je33","Confidence":1,"Status":"inactive"},{"PID":"ROR","Value":"https:\/\/ror.org\/01hvx5h04","Confidence":1,"Status":"active"}]}
|
||||
{"DOI":"10.1161\/hc1202.104524","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/040r8fr65","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/04fctr677","Status":"active","Confidence":1}]}
|
||||
{"DOI":"10.1021\/ma011134f","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/04tj63d06","Status":"active","Confidence":1}]}
|
|
@ -0,0 +1,9 @@
|
|||
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
|
||||
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
|
||||
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
|
||||
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
||||
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
|
||||
{"DOI": "https://doi.org/10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
|
|
@ -0,0 +1,6 @@
|
|||
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, 
"Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}, {"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
|
||||
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}, {"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue