forked from D-Net/dnet-hadoop
Merge branch beta into continuous_validation_2.
This commit is contained in:
commit
95335431e1
|
@ -27,3 +27,5 @@ spark-warehouse
|
|||
/**/.factorypath
|
||||
/**/.scalafmt.conf
|
||||
/.java-version
|
||||
/dhp-shade-package/dependency-reduced-pom.xml
|
||||
/**/job.properties
|
||||
|
|
|
@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
|
|||
mojo.outputFile = testFolder;
|
||||
|
||||
// execute
|
||||
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
|
||||
try {
|
||||
mojo.execute();
|
||||
Assertions.assertTrue(false); // not reached
|
||||
} catch (Exception e) {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
|
||||
IllegalArgumentException.class.isAssignableFrom(e.getClass()));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -63,15 +63,14 @@
|
|||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-pace-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<groupId>edu.cmu</groupId>
|
||||
<artifactId>secondstring</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ibm.icu</groupId>
|
||||
<artifactId>icu4j</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.github.sisyphsu</groupId>
|
||||
<artifactId>dateparser</artifactId>
|
||||
|
@ -161,7 +160,7 @@
|
|||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>${dhp-schemas.artifact}</artifactId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -170,4 +169,23 @@
|
|||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<!-- dependencies required on JDK9+ because J2EE has been removed -->
|
||||
<profiles>
|
||||
<profile>
|
||||
<id>spark-34</id>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>javax.xml.bind</groupId>
|
||||
<artifactId>jaxb-api</artifactId>
|
||||
<version>2.2.11</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.sun.xml.ws</groupId>
|
||||
<artifactId>jaxws-ri</artifactId>
|
||||
<version>2.3.3</version>
|
||||
<type>pom</type>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</profile>
|
||||
</profiles>
|
||||
</project>
|
||||
|
|
|
@ -10,6 +10,11 @@ public class Constants {
|
|||
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
|
||||
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
|
||||
|
||||
public static final String RAID_NS_PREFIX = "raid________";
|
||||
|
||||
public static final String END_DATE = "endDate";
|
||||
public static final String START_DATE = "startDate";
|
||||
|
||||
public static final String ROR_NS_PREFIX = "ror_________";
|
||||
|
||||
public static final String ROR_OPENAIRE_ID = "10|openaire____::993a7ae7a863813cf95028b50708e222";
|
||||
|
|
|
@ -38,7 +38,7 @@ public class PacePerson {
|
|||
PacePerson.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/common/name_particles.txt")));
|
||||
} catch (IOException e) {
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
|
|||
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
|
||||
* concept_rec_id = 656930
|
||||
* @return response code
|
||||
* @throws IOException
|
||||
* @throws MissingConceptDoiException
|
||||
*/
|
||||
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
|
||||
setDepositionId(concept_rec_id, 1);
|
||||
|
|
|
@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;
|
|||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.commons.lang3.time.DateUtils;
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.joda.time.Instant;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -214,11 +212,11 @@ public class HttpConnector2 {
|
|||
.format(
|
||||
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
|
||||
MAPPER.writeValueAsString(report)));
|
||||
} catch (MalformedURLException | UnknownHostException e) {
|
||||
} catch (MalformedURLException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
throw new CollectorException(e.getMessage(), e);
|
||||
} catch (SocketTimeoutException | SocketException e) {
|
||||
} catch (SocketTimeoutException | SocketException | UnknownHostException e) {
|
||||
log.error(e.getMessage(), e);
|
||||
report.put(e.getClass().getName(), e.getMessage());
|
||||
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
|
||||
|
|
|
@ -0,0 +1,82 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.person;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Person;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class CoAuthorshipIterator implements Iterator<Relation> {
|
||||
private int firstIndex;
|
||||
private int secondIndex;
|
||||
private boolean firstRelation;
|
||||
private List<String> authors;
|
||||
private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______::";
|
||||
private static final String OPENAIRE_PREFIX = "openaire____";
|
||||
private static final String SEPARATOR = "::";
|
||||
private static final String ORCID_KEY = "10|" + OPENAIRE_PREFIX + SEPARATOR
|
||||
+ DHPUtils.md5(ModelConstants.ORCID.toLowerCase());
|
||||
public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
|
||||
public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return firstIndex < authors.size() - 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Relation next() {
|
||||
Relation rel = null;
|
||||
if (firstRelation) {
|
||||
rel = getRelation(authors.get(firstIndex), authors.get(secondIndex));
|
||||
firstRelation = Boolean.FALSE;
|
||||
} else {
|
||||
rel = getRelation(authors.get(secondIndex), authors.get(firstIndex));
|
||||
firstRelation = Boolean.TRUE;
|
||||
secondIndex += 1;
|
||||
if (secondIndex >= authors.size()) {
|
||||
firstIndex += 1;
|
||||
secondIndex = firstIndex + 1;
|
||||
}
|
||||
}
|
||||
|
||||
return rel;
|
||||
}
|
||||
|
||||
public CoAuthorshipIterator(List<String> authors) {
|
||||
this.authors = authors;
|
||||
this.firstIndex = 0;
|
||||
this.secondIndex = 1;
|
||||
this.firstRelation = Boolean.TRUE;
|
||||
|
||||
}
|
||||
|
||||
private Relation getRelation(String orcid1, String orcid2) {
|
||||
String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1);
|
||||
String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2);
|
||||
Relation relation = OafMapperUtils
|
||||
.getRelation(
|
||||
source, target, ModelConstants.PERSON_PERSON_RELTYPE,
|
||||
ModelConstants.PERSON_PERSON_SUBRELTYPE,
|
||||
ModelConstants.PERSON_PERSON_HASCOAUTHORED,
|
||||
Arrays.asList(OafMapperUtils.keyValue(ORCID_KEY, ModelConstants.ORCID_DS)),
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, false, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.91"),
|
||||
null);
|
||||
relation.setValidated(true);
|
||||
return relation;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.person;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Serializable bean wrapping a single list of strings.
 *
 * <p>NOTE(review): the entries are presumably author identifiers (e.g. ORCID iDs) belonging to the
 * same work - confirm against the code that populates this bean.
 */
public class Coauthors implements Serializable {

    /** The wrapped list; {@code null} until a value is assigned through the setter. */
    private List<String> coauthors;

    /**
     * @return the list currently held by this bean (the very same instance that was set, not a copy)
     */
    public List<String> getCoauthors() {
        return this.coauthors;
    }

    /**
     * Replaces the list held by this bean.
     *
     * @param coauthors the new list; stored as is, without copying
     */
    public void setCoauthors(final List<String> coauthors) {
        this.coauthors = coauthors;
    }
}
|
|
@ -10,6 +10,7 @@ import org.apache.commons.lang3.StringUtils;
|
|||
import com.wcohen.ss.JaroWinkler;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import scala.Tuple2;
|
||||
|
@ -145,110 +146,21 @@ public class AuthorMerger {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* This method tries to figure out when two author are the same in the contest
|
||||
* of ORCID enrichment
|
||||
*
|
||||
* @param left Author in the OAF entity
|
||||
* @param right Author ORCID
|
||||
* @return based on a heuristic on the names of the authors if they are the same.
|
||||
*/
|
||||
public static boolean checkORCIDSimilarity(final Author left, final Author right) {
|
||||
final Person pl = parse(left);
|
||||
final Person pr = parse(right);
|
||||
|
||||
// If one of them didn't have a surname we verify if they have the fullName not empty
|
||||
// and verify if the normalized version is equal
|
||||
if (!(pl.getSurname() != null && pl.getSurname().stream().anyMatch(StringUtils::isNotBlank) &&
|
||||
pr.getSurname() != null && pr.getSurname().stream().anyMatch(StringUtils::isNotBlank))) {
|
||||
|
||||
if (pl.getFullname() != null && !pl.getFullname().isEmpty() && pr.getFullname() != null
|
||||
&& !pr.getFullname().isEmpty()) {
|
||||
return pl
|
||||
.getFullname()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
fl -> pr.getFullname().stream().anyMatch(fr -> normalize(fl).equalsIgnoreCase(normalize(fr))));
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// The Authors have one surname in common
|
||||
if (pl.getSurname().stream().anyMatch(sl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(sl)))) {
|
||||
|
||||
// If one of them has only a surname and is the same we can say that they are the same author
|
||||
if ((pl.getName() == null || pl.getName().stream().allMatch(StringUtils::isBlank)) ||
|
||||
(pr.getName() == null || pr.getName().stream().allMatch(StringUtils::isBlank)))
|
||||
return true;
|
||||
// The authors have the same initials of Name in common
|
||||
if (pl
|
||||
.getName()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
nl -> pr
|
||||
.getName()
|
||||
.stream()
|
||||
.anyMatch(nr -> nr.equalsIgnoreCase(nl))))
|
||||
return true;
|
||||
}
|
||||
|
||||
// Sometimes we noticed that publication have author wrote in inverse order Surname, Name
|
||||
// We verify if we have an exact match between name and surname
|
||||
if (pl.getSurname().stream().anyMatch(sl -> pr.getName().stream().anyMatch(nr -> nr.equalsIgnoreCase(sl))) &&
|
||||
pl.getName().stream().anyMatch(nl -> pr.getSurname().stream().anyMatch(sr -> sr.equalsIgnoreCase(nl))))
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
//
|
||||
|
||||
/**
|
||||
* Method to enrich ORCID information in one list of authors based on another list
|
||||
*
|
||||
* @param baseAuthor the Author List in the OAF Entity
|
||||
* @param orcidAuthor The list of ORCID Author intersected
|
||||
* @return The Author List of the OAF Entity enriched with the orcid Author
|
||||
*/
|
||||
public static List<Author> enrichOrcid(List<Author> baseAuthor, List<Author> orcidAuthor) {
|
||||
|
||||
if (baseAuthor == null || baseAuthor.isEmpty())
|
||||
return orcidAuthor;
|
||||
|
||||
if (orcidAuthor == null || orcidAuthor.isEmpty())
|
||||
return baseAuthor;
|
||||
|
||||
if (baseAuthor.size() == 1 && orcidAuthor.size() > 10)
|
||||
return baseAuthor;
|
||||
|
||||
final List<Author> oAuthor = new ArrayList<>();
|
||||
oAuthor.addAll(orcidAuthor);
|
||||
|
||||
baseAuthor.forEach(ba -> {
|
||||
Optional<Author> aMatch = oAuthor.stream().filter(oa -> checkORCIDSimilarity(ba, oa)).findFirst();
|
||||
if (aMatch.isPresent()) {
|
||||
final Author sameAuthor = aMatch.get();
|
||||
addPid(ba, sameAuthor.getPid());
|
||||
oAuthor.remove(sameAuthor);
|
||||
}
|
||||
});
|
||||
return baseAuthor;
|
||||
}
|
||||
|
||||
private static void addPid(final Author a, final List<StructuredProperty> pids) {
|
||||
|
||||
if (a.getPid() == null) {
|
||||
a.setPid(new ArrayList<>());
|
||||
}
|
||||
|
||||
a.getPid().addAll(pids);
|
||||
|
||||
}
|
||||
|
||||
public static String pidToComparableString(StructuredProperty pid) {
|
||||
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
|
||||
: "";
|
||||
return (pid.getQualifier() != null ? classid : "")
|
||||
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
|
||||
final String classId = Optional
|
||||
.ofNullable(pid)
|
||||
.map(
|
||||
p -> Optional
|
||||
.ofNullable(p.getQualifier())
|
||||
.map(Qualifier::getClassid)
|
||||
.map(String::toLowerCase)
|
||||
.orElse(""))
|
||||
.orElse("");
|
||||
return Optional
|
||||
.ofNullable(pid)
|
||||
.map(StructuredProperty::getValue)
|
||||
.map(v -> String.join("|", v, classId))
|
||||
.orElse("");
|
||||
}
|
||||
|
||||
public static int countAuthorsPids(List<Author> authors) {
|
||||
|
|
|
@ -2,8 +2,7 @@
|
|||
package eu.dnetlib.dhp.oa.merge;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static org.apache.spark.sql.functions.col;
|
||||
import static org.apache.spark.sql.functions.when;
|
||||
import static org.apache.spark.sql.functions.*;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
@ -14,7 +13,7 @@ import java.util.stream.Collectors;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.ReduceFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -26,7 +25,7 @@ import eu.dnetlib.dhp.schema.common.EntityType;
|
|||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
@ -135,10 +134,12 @@ public class GroupEntitiesSparkJob {
|
|||
.applyCoarVocabularies(entity, vocs),
|
||||
OAFENTITY_KRYO_ENC)
|
||||
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
|
||||
.reduceGroups((ReduceFunction<OafEntity>) OafMapperUtils::mergeEntities)
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, OafEntity, OafEntity>) (key, group) -> MergeUtils.mergeById(group, vocs),
|
||||
OAFENTITY_KRYO_ENC)
|
||||
.map(
|
||||
(MapFunction<Tuple2<String, OafEntity>, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
|
||||
t._2().getClass().getName(), t._2()),
|
||||
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
|
||||
t.getClass().getName(), t),
|
||||
Encoders.tuple(Encoders.STRING(), OAFENTITY_KRYO_ENC));
|
||||
|
||||
// pivot on "_1" (classname of the entity)
|
||||
|
|
|
@ -65,7 +65,13 @@ public class RunSQLSparkJob {
|
|||
for (String statement : sql.split(";\\s*/\\*\\s*EOS\\s*\\*/\\s*")) {
|
||||
log.info("executing: {}", statement);
|
||||
long startTime = System.currentTimeMillis();
|
||||
spark.sql(statement).show();
|
||||
try {
|
||||
spark.sql(statement).show();
|
||||
} catch (Exception e) {
|
||||
log.error("Error executing statement: {}", statement, e);
|
||||
System.err.println("Error executing statement: " + statement + "\n" + e);
|
||||
throw e;
|
||||
}
|
||||
log
|
||||
.info(
|
||||
"executed in {}",
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright (c) 2024.
|
||||
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import org.apache.commons.lang3.builder.EqualsBuilder;
|
||||
import org.apache.commons.lang3.builder.HashCodeBuilder;
|
||||
|
||||
public class HashableStructuredProperty extends StructuredProperty {
|
||||
|
||||
private static final long serialVersionUID = 8371670185221126045L;
|
||||
|
||||
public static HashableStructuredProperty newInstance(String value, Qualifier qualifier, DataInfo dataInfo) {
|
||||
if (value == null) {
|
||||
return null;
|
||||
}
|
||||
final HashableStructuredProperty sp = new HashableStructuredProperty();
|
||||
sp.setValue(value);
|
||||
sp.setQualifier(qualifier);
|
||||
sp.setDataInfo(dataInfo);
|
||||
return sp;
|
||||
}
|
||||
|
||||
public static HashableStructuredProperty newInstance(StructuredProperty sp) {
|
||||
HashableStructuredProperty hsp = new HashableStructuredProperty();
|
||||
hsp.setQualifier(sp.getQualifier());
|
||||
hsp.setValue(sp.getValue());
|
||||
hsp.setQualifier(sp.getQualifier());
|
||||
return hsp;
|
||||
}
|
||||
|
||||
public static StructuredProperty toStructuredProperty(HashableStructuredProperty hsp) {
|
||||
StructuredProperty sp = new StructuredProperty();
|
||||
sp.setQualifier(hsp.getQualifier());
|
||||
sp.setValue(hsp.getValue());
|
||||
sp.setQualifier(hsp.getQualifier());
|
||||
return sp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return new HashCodeBuilder(11, 91)
|
||||
.append(getQualifier().getClassid())
|
||||
.append(getQualifier().getSchemeid())
|
||||
.append(getValue())
|
||||
.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (obj == this) {
|
||||
return true;
|
||||
}
|
||||
if (obj.getClass() != getClass()) {
|
||||
return false;
|
||||
}
|
||||
final HashableStructuredProperty rhs = (HashableStructuredProperty) obj;
|
||||
return new EqualsBuilder()
|
||||
.append(getQualifier().getClassid(), rhs.getQualifier().getClassid())
|
||||
.append(getQualifier().getSchemeid(), rhs.getQualifier().getSchemeid())
|
||||
.append(getValue(), rhs.getValue())
|
||||
.isEquals();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class CleaningFunctions {
|
||||
|
||||
public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)";
|
||||
public static final String DOI_PREFIX = "10.";
|
||||
|
||||
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
||||
|
||||
static {
|
||||
PID_BLACKLIST.add("none");
|
||||
PID_BLACKLIST.add("na");
|
||||
}
|
||||
|
||||
public CleaningFunctions() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method that filter PID values on a per-type basis.
|
||||
* @param s the PID whose value will be checked.
|
||||
* @return false if the pid matches the filter criteria, true otherwise.
|
||||
*/
|
||||
public static boolean pidFilter(StructuredProperty s) {
|
||||
final String pidValue = s.getValue();
|
||||
if (Objects.isNull(s.getQualifier()) ||
|
||||
StringUtils.isBlank(pidValue) ||
|
||||
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
||||
return false;
|
||||
}
|
||||
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
|
||||
return false;
|
||||
}
|
||||
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,14 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
public class DoiCleaningRule {
|
||||
|
||||
public static String clean(final String doi) {
|
||||
return doi
|
||||
.toLowerCase()
|
||||
.replaceAll("\\s", "")
|
||||
if (doi == null)
|
||||
return null;
|
||||
final String replaced = doi
|
||||
.replaceAll("\\n|\\r|\\t|\\s", "")
|
||||
.replaceAll("^doi:", "")
|
||||
.toLowerCase()
|
||||
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
|
||||
if (StringUtils.isEmpty(replaced))
|
||||
return null;
|
||||
|
||||
if (!replaced.contains("10."))
|
||||
return null;
|
||||
|
||||
final String ret = replaced.substring(replaced.indexOf("10."));
|
||||
|
||||
if (!ret.startsWith(CleaningFunctions.DOI_PREFIX))
|
||||
return null;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.OPENAIRE_META_RESOURCE_TYPE;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
|
@ -92,6 +91,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
INVALID_AUTHOR_NAMES.add("null anonymous");
|
||||
INVALID_AUTHOR_NAMES.add("unbekannt");
|
||||
INVALID_AUTHOR_NAMES.add("unknown");
|
||||
INVALID_AUTHOR_NAMES.add("autor, Sin");
|
||||
INVALID_AUTHOR_NAMES.add("Desconocido / Inconnu,");
|
||||
|
||||
INVALID_URL_HOSTS.add("creativecommons.org");
|
||||
INVALID_URL_HOSTS.add("www.academia.edu");
|
||||
|
@ -312,7 +313,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
}
|
||||
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to evaluate here
|
||||
final Datasource d = (Datasource) value;
|
||||
return Objects.nonNull(d.getOfficialname()) && StringUtils.isNotBlank(d.getOfficialname().getValue());
|
||||
} else if (value instanceof Project) {
|
||||
final Project p = (Project) value;
|
||||
return Objects.nonNull(p.getCode()) && StringUtils.isNotBlank(p.getCode().getValue());
|
||||
|
@ -360,6 +362,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Person) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Organization) {
|
||||
Organization o = (Organization) value;
|
||||
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
|
||||
|
@ -505,6 +509,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.sorted((s1, s2) -> s2.getValue().length() - s1.getValue().length())
|
||||
.limit(ModelHardLimits.MAX_ABSTRACTS)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||
|
@ -558,12 +564,24 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
Optional
|
||||
.ofNullable(i.getPid())
|
||||
.ifPresent(pid -> {
|
||||
final Set<StructuredProperty> pids = Sets.newHashSet(pid);
|
||||
final Set<HashableStructuredProperty> pids = pid
|
||||
.stream()
|
||||
.map(HashableStructuredProperty::newInstance)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
Optional
|
||||
.ofNullable(i.getAlternateIdentifier())
|
||||
.ifPresent(altId -> {
|
||||
final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
|
||||
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
|
||||
final Set<HashableStructuredProperty> altIds = altId
|
||||
.stream()
|
||||
.map(HashableStructuredProperty::newInstance)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
i
|
||||
.setAlternateIdentifier(
|
||||
Sets
|
||||
.difference(altIds, pids)
|
||||
.stream()
|
||||
.map(HashableStructuredProperty::toStructuredProperty)
|
||||
.collect(Collectors.toList()));
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -677,6 +695,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
}
|
||||
}
|
||||
|
||||
// set ORCID_PENDING to all orcid values that are not coming from ORCID provenance
|
||||
for (Author a : r.getAuthor()) {
|
||||
if (Objects.isNull(a.getPid())) {
|
||||
a.setPid(Lists.newArrayList());
|
||||
|
@ -733,6 +752,40 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
|
||||
// Identify clashing ORCIDs: that is, the same ORCID associated with multiple authors in this result
|
||||
Map<String, Integer> clashing_orcid = new HashMap<>();
|
||||
|
||||
for (Author a : r.getAuthor()) {
|
||||
a
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(
|
||||
p -> StringUtils
|
||||
.contains(StringUtils.lowerCase(p.getQualifier().getClassid()), ORCID_PENDING))
|
||||
.map(StructuredProperty::getValue)
|
||||
.distinct()
|
||||
.forEach(orcid -> clashing_orcid.compute(orcid, (k, v) -> (v == null) ? 1 : v + 1));
|
||||
}
|
||||
|
||||
Set<String> clashing = clashing_orcid
|
||||
.entrySet()
|
||||
.stream()
|
||||
.filter(ee -> ee.getValue() > 1)
|
||||
.map(Map.Entry::getKey)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
// filter out clashing orcids
|
||||
for (Author a : r.getAuthor()) {
|
||||
a
|
||||
.setPid(
|
||||
a
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(p -> !clashing.contains(p.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
}
|
||||
if (value instanceof Publication) {
|
||||
|
||||
|
@ -791,7 +844,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
return author;
|
||||
}
|
||||
|
||||
private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
|
||||
public static Optional<String> cleanDateField(Field<String> dateofacceptance) {
|
||||
return Optional
|
||||
.ofNullable(dateofacceptance)
|
||||
.map(Field::getValue)
|
||||
|
|
|
@ -0,0 +1,295 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.security.MessageDigest;
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.codec.binary.Hex;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.HashBiMap;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
/**
|
||||
* Factory class for OpenAIRE identifiers in the Graph
|
||||
*/
|
||||
public class IdentifierFactory implements Serializable {
|
||||
|
||||
public static final String ID_SEPARATOR = "::";
|
||||
public static final String ID_PREFIX_SEPARATOR = "|";
|
||||
|
||||
public static final int ID_PREFIX_LEN = 12;
|
||||
|
||||
/**
|
||||
* Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] considered authoritative for that PID_TYPE.
|
||||
* The id of the record (source_::id) will be rewritten as pidType_::id)
|
||||
*/
|
||||
public static final Map<PidType, HashBiMap<String, String>> PID_AUTHORITY = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
PID_AUTHORITY.put(PidType.doi, HashBiMap.create());
|
||||
PID_AUTHORITY.get(PidType.doi).put(CROSSREF_ID, "Crossref");
|
||||
PID_AUTHORITY.get(PidType.doi).put(DATACITE_ID, "Datacite");
|
||||
PID_AUTHORITY.get(PidType.doi).put(ZENODO_OD_ID, "ZENODO");
|
||||
PID_AUTHORITY.get(PidType.doi).put(ZENODO_R3_ID, "Zenodo");
|
||||
|
||||
PID_AUTHORITY.put(PidType.pmc, HashBiMap.create());
|
||||
PID_AUTHORITY.get(PidType.pmc).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
|
||||
PID_AUTHORITY.get(PidType.pmc).put(PUBMED_CENTRAL_ID, "PubMed Central");
|
||||
|
||||
PID_AUTHORITY.put(PidType.pmid, HashBiMap.create());
|
||||
PID_AUTHORITY.get(PidType.pmid).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
|
||||
PID_AUTHORITY.get(PidType.pmid).put(PUBMED_CENTRAL_ID, "PubMed Central");
|
||||
|
||||
PID_AUTHORITY.put(PidType.arXiv, HashBiMap.create());
|
||||
PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive");
|
||||
|
||||
PID_AUTHORITY.put(PidType.w3id, HashBiMap.create());
|
||||
PID_AUTHORITY.get(PidType.w3id).put(ROHUB_ID, "ROHub");
|
||||
}
|
||||
|
||||
/**
|
||||
* Declares the associations PID_TYPE -> [DATASOURCE ID, PID SUBSTRING] considered as delegated authority for that
|
||||
* PID_TYPE. Example, Zenodo is delegated to forge DOIs that contain the 'zenodo' word.
|
||||
*
|
||||
* If a record with the same id (same pid) comes from 2 data sources, the one coming from a delegated source wins. E.g. Zenodo records win over those from Datacite.
|
||||
* See also https://code-repo.d4science.org/D-Net/dnet-hadoop/pulls/187 and the class dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java
|
||||
*/
|
||||
public static final Map<PidType, Map<String, String>> DELEGATED_PID_AUTHORITY = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
DELEGATED_PID_AUTHORITY.put(PidType.doi, new HashMap<>());
|
||||
DELEGATED_PID_AUTHORITY.get(PidType.doi).put(ZENODO_OD_ID, "zenodo");
|
||||
DELEGATED_PID_AUTHORITY.get(PidType.doi).put(ZENODO_R3_ID, "zenodo");
|
||||
DELEGATED_PID_AUTHORITY.put(PidType.w3id, new HashMap<>());
|
||||
DELEGATED_PID_AUTHORITY.get(PidType.w3id).put(ROHUB_ID, "ro-id");
|
||||
}
|
||||
|
||||
/**
|
||||
* Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] whose records are considered enrichment for the graph.
|
||||
* Their OpenAIRE ID is built from the declared PID type. Are merged with their corresponding record, identified by
|
||||
* the same OpenAIRE id.
|
||||
*/
|
||||
public static final Map<PidType, HashBiMap<String, String>> ENRICHMENT_PROVIDER = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
ENRICHMENT_PROVIDER.put(PidType.doi, HashBiMap.create());
|
||||
ENRICHMENT_PROVIDER.get(PidType.doi).put(OPEN_APC_ID, OPEN_APC_NAME);
|
||||
}
|
||||
|
||||
public static Set<String> delegatedAuthorityDatasourceIds() {
|
||||
return DELEGATED_PID_AUTHORITY
|
||||
.values()
|
||||
.stream()
|
||||
.flatMap(m -> m.keySet().stream())
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
public static List<StructuredProperty> getPids(List<StructuredProperty> pid, KeyValue collectedFrom) {
|
||||
return pidFromInstance(pid, collectedFrom, true).distinct().collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static <T extends Result> String createDOIBoostIdentifier(T entity) {
|
||||
if (entity == null)
|
||||
return null;
|
||||
|
||||
StructuredProperty pid = null;
|
||||
if (entity.getPid() != null) {
|
||||
pid = entity
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(s -> s.getQualifier() != null && "doi".equalsIgnoreCase(s.getQualifier().getClassid()))
|
||||
.filter(CleaningFunctions::pidFilter)
|
||||
.findAny()
|
||||
.orElse(null);
|
||||
} else {
|
||||
if (entity.getInstance() != null) {
|
||||
pid = entity
|
||||
.getInstance()
|
||||
.stream()
|
||||
.filter(i -> i.getPid() != null)
|
||||
.flatMap(i -> i.getPid().stream())
|
||||
.filter(CleaningFunctions::pidFilter)
|
||||
.findAny()
|
||||
.orElse(null);
|
||||
}
|
||||
}
|
||||
if (pid != null)
|
||||
return idFromPid(entity, pid, true);
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given
|
||||
* entity T. Returns entity.id when none of the PIDs meet the selection criteria is available.
|
||||
*
|
||||
* @param entity the entity providing PIDs and a default ID.
|
||||
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
|
||||
* @param md5 indicates whether should hash the PID value or not.
|
||||
* @return an identifier from the most relevant PID, entity.id otherwise
|
||||
*/
|
||||
public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
|
||||
|
||||
checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier");
|
||||
|
||||
final Map<String, Set<StructuredProperty>> pids = extractPids(entity);
|
||||
|
||||
return pids
|
||||
.values()
|
||||
.stream()
|
||||
.flatMap(Set::stream)
|
||||
.min(new PidComparator<>(entity))
|
||||
.map(
|
||||
min -> Optional
|
||||
.ofNullable(pids.get(min.getQualifier().getClassid()))
|
||||
.map(
|
||||
p -> p
|
||||
.stream()
|
||||
.sorted(new PidValueComparator())
|
||||
.findFirst()
|
||||
.map(s -> idFromPid(entity, s, md5))
|
||||
.orElseGet(entity::getId))
|
||||
.orElseGet(entity::getId))
|
||||
.orElseGet(entity::getId);
|
||||
}
|
||||
|
||||
private static <T extends OafEntity> Map<String, Set<StructuredProperty>> extractPids(T entity) {
|
||||
if (entity instanceof Result) {
|
||||
return Optional
|
||||
.ofNullable(((Result) entity).getInstance())
|
||||
.map(IdentifierFactory::mapPids)
|
||||
.orElse(new HashMap<>());
|
||||
} else {
|
||||
return entity
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(PidCleaner::normalizePidValue)
|
||||
.filter(CleaningFunctions::pidFilter)
|
||||
.collect(
|
||||
Collectors
|
||||
.groupingBy(
|
||||
p -> p.getQualifier().getClassid(),
|
||||
Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new))));
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<String, Set<StructuredProperty>> mapPids(List<Instance> instance) {
|
||||
return instance
|
||||
.stream()
|
||||
.map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom(), false))
|
||||
.flatMap(Function.identity())
|
||||
.collect(
|
||||
Collectors
|
||||
.groupingBy(
|
||||
p -> p.getQualifier().getClassid(),
|
||||
Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new))));
|
||||
}
|
||||
|
||||
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom,
|
||||
boolean mapHandles) {
|
||||
return Optional
|
||||
.ofNullable(pid)
|
||||
.map(
|
||||
pp -> pp
|
||||
.stream()
|
||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
// filter away PIDs provided by a DS that is not considered an authority for the
|
||||
// given PID Type
|
||||
.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
|
||||
.map(PidCleaner::normalizePidValue)
|
||||
.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
|
||||
.filter(CleaningFunctions::pidFilter))
|
||||
.orElse(Stream.empty());
|
||||
}
|
||||
|
||||
private static boolean shouldFilterPidByCriteria(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) {
|
||||
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
|
||||
|
||||
if (Objects.isNull(collectedFrom)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
boolean isEnrich = Optional
|
||||
.ofNullable(ENRICHMENT_PROVIDER.get(pType))
|
||||
.map(
|
||||
enrich -> enrich.containsKey(collectedFrom.getKey())
|
||||
|| enrich.containsValue(collectedFrom.getValue()))
|
||||
.orElse(false);
|
||||
|
||||
boolean isAuthority = Optional
|
||||
.ofNullable(PID_AUTHORITY.get(pType))
|
||||
.map(
|
||||
authorities -> authorities.containsKey(collectedFrom.getKey())
|
||||
|| authorities.containsValue(collectedFrom.getValue()))
|
||||
.orElse(false);
|
||||
|
||||
return (mapHandles && pType.equals(PidType.handle)) || isEnrich || isAuthority;
|
||||
}
|
||||
|
||||
private static boolean isNotFromDelegatedAuthority(KeyValue collectedFrom, StructuredProperty p) {
|
||||
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
|
||||
|
||||
final Map<String, String> da = DELEGATED_PID_AUTHORITY.get(pType);
|
||||
if (Objects.isNull(da)) {
|
||||
return true;
|
||||
}
|
||||
if (!da.containsKey(collectedFrom.getKey())) {
|
||||
return true;
|
||||
}
|
||||
return StringUtils.contains(p.getValue(), da.get(collectedFrom.getKey()));
|
||||
}
|
||||
|
||||
/**
|
||||
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
|
||||
*/
|
||||
public static <T extends OafEntity> String createIdentifier(T entity) {
|
||||
|
||||
return createIdentifier(entity, true);
|
||||
}
|
||||
|
||||
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
|
||||
return idFromPid(ModelSupport.getIdPrefix(entity.getClass()), s.getQualifier().getClassid(), s.getValue(), md5);
|
||||
}
|
||||
|
||||
public static String idFromPid(String numericPrefix, String pidType, String pidValue, boolean md5) {
|
||||
return new StringBuilder()
|
||||
.append(numericPrefix)
|
||||
.append(ID_PREFIX_SEPARATOR)
|
||||
.append(createPrefix(pidType))
|
||||
.append(ID_SEPARATOR)
|
||||
.append(md5 ? md5(pidValue) : pidValue)
|
||||
.toString();
|
||||
}
|
||||
|
||||
// create the prefix (length = 12)
|
||||
private static String createPrefix(String pidType) {
|
||||
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
|
||||
while (prefix.length() < ID_PREFIX_LEN) {
|
||||
prefix.append("_");
|
||||
}
|
||||
return prefix.substring(0, ID_PREFIX_LEN);
|
||||
}
|
||||
|
||||
public static String md5(final String s) {
|
||||
try {
|
||||
final MessageDigest md = MessageDigest.getInstance("MD5");
|
||||
md.update(s.getBytes(StandardCharsets.UTF_8));
|
||||
return new String(Hex.encodeHex(md.digest()));
|
||||
} catch (final Exception e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,78 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
//
|
||||
// Source code recreated from a .class file by IntelliJ IDEA
|
||||
// (powered by FernFlower decompiler)
|
||||
//
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class MergeComparator implements Comparator<Oaf> {
|
||||
public MergeComparator() {
|
||||
}
|
||||
|
||||
public int compare(Oaf left, Oaf right) {
|
||||
// nulls at the end
|
||||
if (left == null && right == null) {
|
||||
return 0;
|
||||
} else if (left == null) {
|
||||
return -1;
|
||||
} else if (right == null) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// invisible
|
||||
if (left.getDataInfo() != null && left.getDataInfo().getInvisible() == true) {
|
||||
if (right.getDataInfo() != null && right.getDataInfo().getInvisible() == false) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// collectedfrom
|
||||
HashSet<String> lCf = getCollectedFromIds(left);
|
||||
HashSet<String> rCf = getCollectedFromIds(right);
|
||||
if (lCf.contains("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")
|
||||
&& !rCf.contains("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")) {
|
||||
return -1;
|
||||
} else if (!lCf.contains("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")
|
||||
&& rCf.contains("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
SubEntityType lClass = SubEntityType.fromClass(left.getClass());
|
||||
SubEntityType rClass = SubEntityType.fromClass(right.getClass());
|
||||
return lClass.ordinal() - rClass.ordinal();
|
||||
|
||||
}
|
||||
|
||||
protected HashSet<String> getCollectedFromIds(Oaf left) {
|
||||
return (HashSet) Optional.ofNullable(left.getCollectedfrom()).map((cf) -> {
|
||||
return (HashSet) cf.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
|
||||
}).orElse(new HashSet());
|
||||
}
|
||||
|
||||
enum SubEntityType {
|
||||
publication, dataset, software, otherresearchproduct, datasource, organization, project;
|
||||
|
||||
/**
|
||||
* Resolves the EntityType, given the relative class name
|
||||
*
|
||||
* @param clazz the given class name
|
||||
* @param <T> actual OafEntity subclass
|
||||
* @return the EntityType associated to the given class
|
||||
*/
|
||||
public static <T extends Oaf> SubEntityType fromClass(Class<T> clazz) {
|
||||
return valueOf(clazz.getSimpleName().toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,106 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class MergeEntitiesComparator implements Comparator<Oaf> {
|
||||
static final List<String> PID_AUTHORITIES = Arrays
|
||||
.asList(
|
||||
ModelConstants.ARXIV_ID,
|
||||
ModelConstants.PUBMED_CENTRAL_ID,
|
||||
ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
|
||||
ModelConstants.DATACITE_ID,
|
||||
ModelConstants.CROSSREF_ID);
|
||||
|
||||
static final List<String> RESULT_TYPES = Arrays
|
||||
.asList(
|
||||
ModelConstants.ORP_RESULTTYPE_CLASSID,
|
||||
ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
|
||||
ModelConstants.DATASET_RESULTTYPE_CLASSID,
|
||||
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||
|
||||
public static final Comparator<Oaf> INSTANCE = new MergeEntitiesComparator();
|
||||
|
||||
@Override
|
||||
public int compare(Oaf left, Oaf right) {
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return -1;
|
||||
if (right == null)
|
||||
return 1;
|
||||
|
||||
int res = 0;
|
||||
|
||||
// pid authority
|
||||
int cfp1 = Optional
|
||||
.ofNullable(left.getCollectedfrom())
|
||||
.map(
|
||||
cf -> cf
|
||||
.stream()
|
||||
.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
|
||||
.max(Integer::compare)
|
||||
.orElse(-1))
|
||||
.orElse(-1);
|
||||
int cfp2 = Optional
|
||||
.ofNullable(right.getCollectedfrom())
|
||||
.map(
|
||||
cf -> cf
|
||||
.stream()
|
||||
.map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
|
||||
.max(Integer::compare)
|
||||
.orElse(-1))
|
||||
.orElse(-1);
|
||||
|
||||
if (cfp1 >= 0 && cfp1 > cfp2) {
|
||||
return 1;
|
||||
} else if (cfp2 >= 0 && cfp2 > cfp1) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// trust
|
||||
if (left.getDataInfo() != null && right.getDataInfo() != null) {
|
||||
res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust());
|
||||
}
|
||||
|
||||
// result type
|
||||
if (res == 0) {
|
||||
if (left instanceof Result && right instanceof Result) {
|
||||
Result r1 = (Result) left;
|
||||
Result r2 = (Result) right;
|
||||
|
||||
if (r1.getResulttype() == null || r1.getResulttype().getClassid() == null) {
|
||||
if (r2.getResulttype() != null && r2.getResulttype().getClassid() != null) {
|
||||
return -1;
|
||||
}
|
||||
} else if (r2.getResulttype() == null || r2.getResulttype().getClassid() == null) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int rt1 = RESULT_TYPES.indexOf(r1.getResulttype().getClassid());
|
||||
int rt2 = RESULT_TYPES.indexOf(r2.getResulttype().getClassid());
|
||||
|
||||
if (rt1 >= 0 && rt1 > rt2) {
|
||||
return 1;
|
||||
} else if (rt2 >= 0 && rt2 > rt1) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// id
|
||||
if (res == 0) {
|
||||
if (left instanceof OafEntity && right instanceof OafEntity) {
|
||||
res = ((OafEntity) right).getId().compareTo(((OafEntity) left).getId());
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,40 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
|
||||
public class ModelHardLimits {
|
||||
|
||||
private ModelHardLimits() {
|
||||
}
|
||||
|
||||
public static final String LAYOUT = "index";
|
||||
public static final String INTERPRETATION = "openaire";
|
||||
public static final String SEPARATOR = "-";
|
||||
|
||||
public static final int MAX_EXTERNAL_ENTITIES = 50;
|
||||
public static final int MAX_AUTHORS = 200;
|
||||
public static final int MAX_RELATED_AUTHORS = 20;
|
||||
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
|
||||
public static final int MAX_TITLE_LENGTH = 5000;
|
||||
public static final int MAX_TITLES = 10;
|
||||
public static final int MAX_ABSTRACTS = 10;
|
||||
public static final int MAX_ABSTRACT_LENGTH = 150000;
|
||||
public static final int MAX_RELATED_ABSTRACT_LENGTH = 500;
|
||||
public static final int MAX_INSTANCES = 10;
|
||||
public static final Map<String, Long> MAX_RELATIONS_BY_RELCLASS = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.PERSON_PERSON_HASCOAUTHORED, 500L);
|
||||
MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.RESULT_PERSON_HASAUTHORED, 500L);
|
||||
}
|
||||
|
||||
public static String getCollectionName(String format) {
|
||||
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
|
||||
}
|
||||
|
||||
}
|
|
@ -14,7 +14,6 @@ import java.util.stream.Collectors;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class OafMapperUtils {
|
||||
|
@ -22,65 +21,6 @@ public class OafMapperUtils {
|
|||
private OafMapperUtils() {
|
||||
}
|
||||
|
||||
public static Oaf merge(final Oaf left, final Oaf right) {
|
||||
if (ModelSupport.isSubClass(left, OafEntity.class)) {
|
||||
return mergeEntities((OafEntity) left, (OafEntity) right);
|
||||
} else if (ModelSupport.isSubClass(left, Relation.class)) {
|
||||
((Relation) left).mergeFrom((Relation) right);
|
||||
} else {
|
||||
throw new IllegalArgumentException("invalid Oaf type:" + left.getClass().getCanonicalName());
|
||||
}
|
||||
return left;
|
||||
}
|
||||
|
||||
public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
|
||||
if (ModelSupport.isSubClass(left, Result.class)) {
|
||||
return mergeResults((Result) left, (Result) right);
|
||||
} else if (ModelSupport.isSubClass(left, Datasource.class)) {
|
||||
left.mergeFrom(right);
|
||||
} else if (ModelSupport.isSubClass(left, Organization.class)) {
|
||||
left.mergeFrom(right);
|
||||
} else if (ModelSupport.isSubClass(left, Project.class)) {
|
||||
left.mergeFrom(right);
|
||||
} else {
|
||||
throw new IllegalArgumentException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
|
||||
}
|
||||
return left;
|
||||
}
|
||||
|
||||
public static Result mergeResults(Result left, Result right) {
|
||||
|
||||
final boolean leftFromDelegatedAuthority = isFromDelegatedAuthority(left);
|
||||
final boolean rightFromDelegatedAuthority = isFromDelegatedAuthority(right);
|
||||
|
||||
if (leftFromDelegatedAuthority && !rightFromDelegatedAuthority) {
|
||||
return left;
|
||||
}
|
||||
if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
|
||||
return right;
|
||||
}
|
||||
|
||||
if (new ResultTypeComparator().compare(left, right) < 0) {
|
||||
left.mergeFrom(right);
|
||||
return left;
|
||||
} else {
|
||||
right.mergeFrom(left);
|
||||
return right;
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isFromDelegatedAuthority(Result r) {
|
||||
return Optional
|
||||
.ofNullable(r.getInstance())
|
||||
.map(
|
||||
instance -> instance
|
||||
.stream()
|
||||
.filter(i -> Objects.nonNull(i.getCollectedfrom()))
|
||||
.map(i -> i.getCollectedfrom().getKey())
|
||||
.anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId)))
|
||||
.orElse(false);
|
||||
}
|
||||
|
||||
public static KeyValue keyValue(final String k, final String v) {
|
||||
final KeyValue kv = new KeyValue();
|
||||
kv.setKey(k);
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class OrganizationPidComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
@Override
|
||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||
if (left == null) {
|
||||
return right == null ? 0 : -1;
|
||||
} else if (right == null) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
|
||||
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (lClass.equals(PidType.openorgs))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.openorgs))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.GRID))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.GRID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.mag_id))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.mag_id))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.urn))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.urn))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
 * Map of blacklisted values: each key is associated with a set of strings. PidBlacklistProvider looks it up by PID
 * type.
 */
public class PidBlacklist extends HashMap<String, HashSet<String>> {
}
|
|
@ -0,0 +1,40 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
public class PidBlacklistProvider {
|
||||
|
||||
private static final PidBlacklist blacklist;
|
||||
|
||||
static {
|
||||
try {
|
||||
String json = IOUtils.toString(IdentifierFactory.class.getResourceAsStream("pid_blacklist.json"));
|
||||
blacklist = new ObjectMapper().readValue(json, PidBlacklist.class);
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static PidBlacklist getBlacklist() {
|
||||
return blacklist;
|
||||
}
|
||||
|
||||
public static Set<String> getBlacklist(String pidType) {
|
||||
return Optional
|
||||
.ofNullable(getBlacklist().get(pidType))
|
||||
.orElse(new HashSet<>());
|
||||
}
|
||||
|
||||
private PidBlacklistProvider() {
|
||||
}
|
||||
|
||||
}
|
|
@ -26,7 +26,7 @@ public class PidCleaner {
|
|||
String value = Optional
|
||||
.ofNullable(pidValue)
|
||||
.map(String::trim)
|
||||
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
||||
.orElseThrow(() -> new IllegalArgumentException("PID (" + pidType + ") value cannot be empty"));
|
||||
|
||||
switch (pidType) {
|
||||
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
|
||||
|
||||
private final T entity;
|
||||
|
||||
public PidComparator(T entity) {
|
||||
this.entity = entity;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return 1;
|
||||
if (right == null)
|
||||
return -1;
|
||||
|
||||
if (ModelSupport.isSubClass(entity, Result.class)) {
|
||||
return compareResultPids(left, right);
|
||||
}
|
||||
if (ModelSupport.isSubClass(entity, Organization.class)) {
|
||||
return compareOrganizationtPids(left, right);
|
||||
}
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return left.getQualifier().getClassid().compareTo(right.getQualifier().getClassid());
|
||||
}
|
||||
|
||||
private int compareResultPids(StructuredProperty left, StructuredProperty right) {
|
||||
return new ResultPidComparator().compare(left, right);
|
||||
}
|
||||
|
||||
private int compareOrganizationtPids(StructuredProperty left, StructuredProperty right) {
|
||||
return new OrganizationPidComparator().compare(left, right);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import org.apache.commons.lang3.EnumUtils;
|
||||
|
||||
public enum PidType {
|
||||
|
||||
/**
|
||||
* The DOI syntax shall be made up of a DOI prefix and a DOI suffix separated by a forward slash.
|
||||
*
|
||||
* There is no defined limit on the length of the DOI name, or of the DOI prefix or DOI suffix.
|
||||
*
|
||||
* The DOI name is case-insensitive and can incorporate any printable characters from the legal graphic characters
|
||||
* of Unicode. Further constraints on character use (e.g. use of language-specific alphanumeric characters) can be
|
||||
* defined for an application by the ISO 26324 Registration Authority.
|
||||
*
|
||||
*
|
||||
* DOI prefix: The DOI prefix shall be composed of a directory indicator followed by a registrant code.
|
||||
* These two components shall be separated by a full stop (period). The directory indicator shall be "10" and
|
||||
* distinguishes the entire set of character strings (prefix and suffix) as digital object identifiers within the
|
||||
* resolution system.
|
||||
*
|
||||
* Registrant code: The second element of the DOI prefix shall be the registrant code. The registrant code is a
|
||||
* unique string assigned to a registrant.
|
||||
*
|
||||
* DOI suffix: The DOI suffix shall consist of a character string of any length chosen by the registrant.
|
||||
* Each suffix shall be unique to the prefix element that precedes it. The unique suffix can be a sequential number,
|
||||
* or it might incorporate an identifier generated from or based on another system used by the registrant
|
||||
* (e.g. ISAN, ISBN, ISRC, ISSN, ISTC, ISNI; in such cases, a preferred construction for such a suffix can be
|
||||
* specified, as in Example 1).
|
||||
*
|
||||
* Source: https://www.doi.org/doi_handbook/2_Numbering.html#2.2
|
||||
*/
|
||||
doi,
|
||||
|
||||
/**
|
||||
* PubMed Unique Identifier (PMID)
|
||||
*
|
||||
* This field is a 1-to-8 digit accession number with no leading zeros. It is present on all records and is the
|
||||
* accession number for managing and disseminating records. PMIDs are not reused after records are deleted.
|
||||
*
|
||||
* Beginning in February 2012 PMIDs include extensions following a decimal point to account for article versions
|
||||
* (e.g., 21804956.2). All citations are considered version 1 until replaced. The extended PMID is not displayed
|
||||
* on the MEDLINE format.
|
||||
*
|
||||
* View the citation in abstract format in PubMed to access additional versions when available (see the article in
|
||||
* the Jan-Feb 2012 NLM Technical Bulletin).
|
||||
*
|
||||
* Source: https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmid
|
||||
*/
|
||||
pmid,
|
||||
|
||||
/**
|
||||
* This field contains the unique identifier for the cited article in PubMed Central. The identifier begins with the
|
||||
* prefix PMC.
|
||||
*
|
||||
* Source: https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmc
|
||||
*/
|
||||
pmc, handle, arXiv, nct, pdb, w3id,
|
||||
|
||||
// Organization
|
||||
openorgs, ROR, GRID, PIC, ISNI, Wikidata, FundRef, corda, corda_h2020, mag_id, urn,
|
||||
|
||||
// Used by dedup
|
||||
undefined, original;
|
||||
|
||||
public static boolean isValid(String type) {
|
||||
return EnumUtils.isValidEnum(PidType.class, type);
|
||||
}
|
||||
|
||||
public static PidType tryValueOf(String s) {
|
||||
try {
|
||||
return PidType.valueOf(s);
|
||||
} catch (Exception e) {
|
||||
return PidType.original;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class PidValueComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
@Override
|
||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return 1;
|
||||
if (right == null)
|
||||
return -1;
|
||||
|
||||
StructuredProperty l = PidCleaner.normalizePidValue(left);
|
||||
StructuredProperty r = PidCleaner.normalizePidValue(right);
|
||||
|
||||
return Optional
|
||||
.ofNullable(l.getValue())
|
||||
.map(
|
||||
lv -> Optional
|
||||
.ofNullable(r.getValue())
|
||||
.map(rv -> lv.compareTo(rv))
|
||||
.orElse(-1))
|
||||
.orElse(1);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
|
||||
/**
|
||||
* Comparator for sorting the values from the dnet:review_levels vocabulary, implements the following ordering
|
||||
*
|
||||
* peerReviewed (0001) > nonPeerReviewed (0002) > UNKNOWN (0000)
|
||||
*/
|
||||
public class RefereedComparator implements Comparator<Qualifier> {
|
||||
|
||||
@Override
|
||||
public int compare(Qualifier left, Qualifier right) {
|
||||
if (left == null || left.getClassid() == null) {
|
||||
return (right == null || right.getClassid() == null) ? 0 : -1;
|
||||
} else if (right == null || right.getClassid() == null) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
String lClass = left.getClassid();
|
||||
String rClass = right.getClassid();
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if ("0001".equals(lClass))
|
||||
return -1;
|
||||
if ("0001".equals(rClass))
|
||||
return 1;
|
||||
|
||||
if ("0002".equals(lClass))
|
||||
return -1;
|
||||
if ("0002".equals(rClass))
|
||||
return 1;
|
||||
|
||||
if ("0000".equals(lClass))
|
||||
return -1;
|
||||
if ("0000".equals(rClass))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class ResultPidComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
@Override
|
||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||
|
||||
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
|
||||
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (lClass.equals(PidType.doi))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.doi))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.pmid))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.pmid))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.pmc))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.pmc))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.handle))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.handle))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.arXiv))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.arXiv))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.nct))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.nct))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(PidType.pdb))
|
||||
return -1;
|
||||
if (rClass.equals(PidType.pdb))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -28,6 +28,7 @@ import com.jayway.jsonpath.JsonPath;
|
|||
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||
import net.minidev.json.JSONArray;
|
||||
import scala.collection.JavaConverters;
|
||||
import scala.collection.Seq;
|
||||
|
@ -104,7 +105,7 @@ public class DHPUtils {
|
|||
|
||||
public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
|
||||
|
||||
final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
|
||||
final String cleanedPid = PidCleaner.normalizePidValue(pidType, pid);
|
||||
|
||||
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
|
||||
}
|
||||
|
|
|
@ -0,0 +1,101 @@
|
|||
|
||||
package eu.dnetlib.pace.common;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
/**
|
||||
* Set of common functions for the framework
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
public class PaceCommonUtils {
|
||||
|
||||
// transliterator
|
||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
s.chars().forEach(ch -> {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
||||
});
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String transliterate(final String s) {
|
||||
try {
|
||||
return transliterator.transliterate(s);
|
||||
} catch (Exception e) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
public static String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||
// strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
public static String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
public static String unicodeNormalization(final String s) {
|
||||
|
||||
Matcher m = hexUnicodePattern.matcher(s);
|
||||
StringBuffer buf = new StringBuffer(s.length());
|
||||
while (m.find()) {
|
||||
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||
}
|
||||
m.appendTail(buf);
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils
|
||||
.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
}
|
||||
|
||||
}
|
|
@ -12,7 +12,7 @@ import com.google.common.collect.Iterables;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.common.PaceCommonUtils;
|
||||
import eu.dnetlib.pace.util.Capitalise;
|
||||
import eu.dnetlib.pace.util.DotAbbreviations;
|
||||
|
||||
|
@ -86,7 +86,7 @@ public class Person {
|
|||
|
||||
private List<String> splitTerms(final String s) {
|
||||
if (particles == null) {
|
||||
particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||
particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||
}
|
||||
|
||||
final List<String> list = Lists.newArrayList();
|
|
@ -15,4 +15,4 @@ public class Capitalise implements Function<String, String> {
|
|||
public String apply(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||
}
|
||||
};
|
||||
}
|
|
@ -8,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
|
|||
public String apply(String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
};
|
||||
}
|
|
@ -154,5 +154,13 @@
|
|||
"unknown":{
|
||||
"original":"Unknown",
|
||||
"inverse":"Unknown"
|
||||
},
|
||||
"isamongtopnsimilardocuments": {
|
||||
"original": "IsAmongTopNSimilarDocuments",
|
||||
"inverse": "HasAmongTopNSimilarDocuments"
|
||||
},
|
||||
"hasamongtopnsimilardocuments": {
|
||||
"original": "HasAmongTopNSimilarDocuments",
|
||||
"inverse": "IsAmongTopNSimilarDocuments"
|
||||
}
|
||||
}
|
|
@ -1,5 +1,8 @@
|
|||
package eu.dnetlib.dhp.application
|
||||
|
||||
import eu.dnetlib.dhp.common.Constants
|
||||
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
/** This is the main Interface SparkApplication
|
||||
|
@ -62,12 +65,22 @@ abstract class AbstractScalaApplication(
|
|||
val conf: SparkConf = new SparkConf()
|
||||
val master = parser.get("master")
|
||||
log.info(s"Creating Spark session: Master: $master")
|
||||
SparkSession
|
||||
val b = SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
if (master != null)
|
||||
b.master(master)
|
||||
b.getOrCreate()
|
||||
}
|
||||
|
||||
/** Counts the rows obtained by reading `targetPath` as text and writes that count,
  * as a string, to `outputBasePath + Constants.MDSTORE_SIZE_PATH` through `writeHdfsFile`.
  *
  * @param targetPath     path read with `spark.read.text`
  * @param outputBasePath base path under which the size file is written
  */
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
  val totalItems = spark.read.text(targetPath).count()
  val hadoopConf = spark.sparkContext.hadoopConfiguration
  val sizeFilePath = outputBasePath + Constants.MDSTORE_SIZE_PATH
  writeHdfsFile(hadoopConf, totalItems.toString, sizeFilePath)
}
|
||||
|
||||
}
|
||||
|
|
|
@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
|
|||
}
|
||||
|
||||
/** Builds a ScholixResource from a Result by first converting it to a summary.
  *
  * @param r the result to convert
  * @return the resource generated from the summary, or null when no summary
  *         could be produced for `r`
  */
def generateScholixResourceFromResult(r: Result): ScholixResource = {
  // compute the summary once and reuse it, instead of converting the result a second time
  val sum = ScholixUtils.resultToSummary(r)
  if (sum != null)
    generateScholixResourceFromSummary(sum)
  else
    null
}
|
||||
|
||||
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
|
||||
|
@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {
|
|||
|
||||
}
|
||||
|
||||
/** Looks up the lower-cased relation name in `relations` and returns its inverse.
  *
  * @param rel the relation name, matched case-insensitively via lower-casing
  * @return the inverse of the matching relation, or null when no relation is found
  */
def invRel(rel: String): String = {
  val key = rel.toLowerCase
  relations.getOrElse(key, null) match {
    case null  => null
    case found => found.inverse
  }
}
|
||||
|
||||
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
|
||||
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
|
||||
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
|
||||
|
@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
|
|||
if (persistentIdentifiers.isEmpty)
|
||||
return null
|
||||
s.setLocalIdentifier(persistentIdentifiers.asJava)
|
||||
if (r.isInstanceOf[Publication])
|
||||
s.setTypology(Typology.publication)
|
||||
else
|
||||
s.setTypology(Typology.dataset)
|
||||
// s.setTypology(r.getResulttype.getClassid)
|
||||
|
||||
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
|
||||
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class BlackListProviderTest {
|
||||
|
||||
@Test
|
||||
void blackListTest() {
|
||||
|
||||
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist());
|
||||
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist().get("doi"));
|
||||
Assertions.assertTrue(PidBlacklistProvider.getBlacklist().get("doi").size() > 0);
|
||||
final Set<String> xxx = PidBlacklistProvider.getBlacklist("xxx");
|
||||
Assertions.assertNotNull(xxx);
|
||||
Assertions.assertEquals(0, xxx.size());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
|
||||
class IdentifierFactoryTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
@Test
|
||||
void testCreateIdentifierForPublication() throws IOException {
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_doi1.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_doi3.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_doi5.json", "50|doi_________::3bef95c0ca26dd55451fc8839ea69d27", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_pmc2.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);
|
||||
|
||||
verifyIdentifier(
|
||||
"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
||||
|
||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
||||
verifyIdentifier("publication_3.json", defaultID, true);
|
||||
verifyIdentifier("publication_4.json", defaultID, true);
|
||||
verifyIdentifier("publication_5.json", defaultID, true);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testCreateIdentifierForPublicationNoHash() throws IOException {
|
||||
|
||||
verifyIdentifier("publication_doi1.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
|
||||
verifyIdentifier("publication_doi2.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
|
||||
verifyIdentifier("publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false);
|
||||
verifyIdentifier(
|
||||
"publication_urn1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false);
|
||||
|
||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
||||
verifyIdentifier("publication_3.json", defaultID, false);
|
||||
verifyIdentifier("publication_4.json", defaultID, false);
|
||||
verifyIdentifier("publication_5.json", defaultID, false);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testCreateIdentifierForROHub() throws IOException {
|
||||
verifyIdentifier(
|
||||
"orp-rohub.json", "50|w3id________::afc7592914ae190a50570db90f55f9c2", true);
|
||||
}
|
||||
|
||||
protected void verifyIdentifier(String filename, String expectedID, boolean md5) throws IOException {
|
||||
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
||||
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
|
||||
|
||||
String id = IdentifierFactory.createIdentifier(pub, md5);
|
||||
System.out.println(id);
|
||||
assertNotNull(id);
|
||||
assertEquals(expectedID, id);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,130 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.beanutils.BeanUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class MergeUtilsTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
@Test
|
||||
void testMergePubs_new() throws IOException {
|
||||
Publication pt = read("publication_test.json", Publication.class);
|
||||
Publication p1 = read("publication_test.json", Publication.class);
|
||||
|
||||
assertEquals(1, pt.getCollectedfrom().size());
|
||||
assertEquals(ModelConstants.CROSSREF_ID, pt.getCollectedfrom().get(0).getKey());
|
||||
|
||||
Instance i = new Instance();
|
||||
i.setUrl(Lists.newArrayList("https://..."));
|
||||
p1.getInstance().add(i);
|
||||
|
||||
Publication ptp1 = MergeUtils.mergePublication(pt, p1);
|
||||
|
||||
assertNotNull(ptp1.getInstance());
|
||||
assertEquals(2, ptp1.getInstance().size());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testMergePubs() throws IOException {
|
||||
Publication p1 = read("publication_1.json", Publication.class);
|
||||
Publication p2 = read("publication_2.json", Publication.class);
|
||||
Dataset d1 = read("dataset_1.json", Dataset.class);
|
||||
Dataset d2 = read("dataset_2.json", Dataset.class);
|
||||
|
||||
assertEquals(1, p1.getCollectedfrom().size());
|
||||
assertEquals(ModelConstants.CROSSREF_ID, p1.getCollectedfrom().get(0).getKey());
|
||||
assertEquals(1, d2.getCollectedfrom().size());
|
||||
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||
|
||||
assertEquals(1, p2.getCollectedfrom().size());
|
||||
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||
assertEquals(1, d1.getCollectedfrom().size());
|
||||
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||
|
||||
final Result p1d2 = MergeUtils.checkedMerge(p1, d2, true);
|
||||
assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid());
|
||||
assertTrue(p1d2 instanceof Publication);
|
||||
assertEquals(p1.getId(), p1d2.getId());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testMergePubs_1() throws IOException {
|
||||
Publication p2 = read("publication_2.json", Publication.class);
|
||||
Dataset d1 = read("dataset_1.json", Dataset.class);
|
||||
|
||||
final Result p2d1 = MergeUtils.checkedMerge(p2, d1, true);
|
||||
assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid());
|
||||
assertTrue(p2d1 instanceof Dataset);
|
||||
assertEquals(d1.getId(), p2d1.getId());
|
||||
assertEquals(2, p2d1.getCollectedfrom().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testMergePubs_2() throws IOException {
|
||||
Publication p1 = read("publication_1.json", Publication.class);
|
||||
Publication p2 = read("publication_2.json", Publication.class);
|
||||
|
||||
Result p1p2 = MergeUtils.checkedMerge(p1, p2, true);
|
||||
assertTrue(p1p2 instanceof Publication);
|
||||
assertEquals(p1.getId(), p1p2.getId());
|
||||
assertEquals(2, p1p2.getCollectedfrom().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testDelegatedAuthority_1() throws IOException {
|
||||
Dataset d1 = read("dataset_2.json", Dataset.class);
|
||||
Dataset d2 = read("dataset_delegated.json", Dataset.class);
|
||||
|
||||
assertEquals(1, d2.getCollectedfrom().size());
|
||||
assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID));
|
||||
|
||||
Result res = (Result) MergeUtils.merge(d1, d2, true);
|
||||
|
||||
assertEquals(d2, res);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testDelegatedAuthority_2() throws IOException {
|
||||
Dataset p1 = read("publication_1.json", Dataset.class);
|
||||
Dataset d2 = read("dataset_delegated.json", Dataset.class);
|
||||
|
||||
assertEquals(1, d2.getCollectedfrom().size());
|
||||
assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID));
|
||||
|
||||
Result res = (Result) MergeUtils.merge(p1, d2, true);
|
||||
|
||||
assertEquals(d2, res);
|
||||
}
|
||||
|
||||
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
|
||||
return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
|
||||
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
||||
return OBJECT_MAPPER.readValue(json, clazz);
|
||||
}
|
||||
|
||||
}
|
|
@ -149,7 +149,7 @@ class OafMapperUtilsTest {
|
|||
void testDate() {
|
||||
final String date = GraphCleaningFunctions.cleanDate("23-FEB-1998");
|
||||
assertNotNull(date);
|
||||
System.out.println(date);
|
||||
assertEquals("1998-02-23", date);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -166,8 +166,8 @@ class OafMapperUtilsTest {
|
|||
|
||||
assertEquals(
|
||||
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID,
|
||||
OafMapperUtils
|
||||
.mergeResults(p1, d2)
|
||||
MergeUtils
|
||||
.mergeResult(p1, d2)
|
||||
.getResulttype()
|
||||
.getClassid());
|
||||
|
||||
|
@ -178,10 +178,10 @@ class OafMapperUtilsTest {
|
|||
|
||||
assertEquals(
|
||||
ModelConstants.DATASET_RESULTTYPE_CLASSID,
|
||||
OafMapperUtils
|
||||
.mergeResults(p2, d1)
|
||||
.getResulttype()
|
||||
.getClassid());
|
||||
((Result) MergeUtils
|
||||
.merge(p2, d1, true))
|
||||
.getResulttype()
|
||||
.getClassid());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -192,7 +192,7 @@ class OafMapperUtilsTest {
|
|||
assertEquals(1, d2.getCollectedfrom().size());
|
||||
assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID));
|
||||
|
||||
Result res = OafMapperUtils.mergeResults(d1, d2);
|
||||
Result res = MergeUtils.mergeResult(d1, d2);
|
||||
|
||||
assertEquals(d2, res);
|
||||
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0000/ra.v2i3.114::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"4.65008652949e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0000/ra.v2i3.114"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0001/(aj).v3i6.458::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"4.01810569717e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0001/(aj).v3i6.458"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0001/1587::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.39172290649e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0001/1587"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0001/462::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"6.33235333753e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.36"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.00285265116e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0001/462"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0001/731::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"4.01810569717e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0001/731"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0001/ijllis.v9i4.2066.g2482::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"8.48190886761e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0001/ijllis.v9i4.2066.g2482"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0118/alfahim.v3i1.140::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"9.88840807598e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0118/alfahim.v3i1.140"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0166/fk2.stagefigshare.6442896.v3::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"7.28336930301e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0166/fk2.stagefigshare.6442896.v3"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0301/jttb.v2i1.64::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"7.28336930301e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0301/jttb.v2i1.64"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0809/seruni.v1i1.567::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"2.62959564033e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0809/seruni.v1i1.567"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0809/seruni.v2i1.765::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"9.40178571921e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0559872"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"3.67659957614e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0809/seruni.v2i1.765"}]}]}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:enrich","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"id":"unresolved::10.0901/jkip.v7i3.485::doi","instance":[{"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"6.26204125721e-09"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.0901/jkip.v7i3.485"}]}]}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[]}
|
|
@ -0,0 +1 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
||||
"instance": [
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2",
|
||||
"value": "Crossref"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "doi"},
|
||||
"value": "10.1016/j.cmet.2010.03.013"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "urn"},
|
||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "scp-number"},
|
||||
"value": "79953761260"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "21459329"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
{
|
||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
||||
"instance": [
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2",
|
||||
"value": "Crossref"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "doi"},
|
||||
"value": "10.1016/j.cmet.2010.03.013"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
|
||||
"value": "Europe PubMed Central"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "urn"},
|
||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "scp-number"},
|
||||
"value": "79953761260"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "21459329"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
{
|
||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
||||
"instance": [
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
|
||||
"value": "Zenodo"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "doi"},
|
||||
"value": "10.1016/j.cmet.2010.03.013"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
|
||||
"value": "Europe PubMed Central"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "urn"},
|
||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "scp-number"},
|
||||
"value": "79953761260"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "PMC21459329"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
{
|
||||
"id": "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66",
|
||||
"instance": [
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
|
||||
"value": "Zenodo"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "doi"},
|
||||
"value": "10.1016/j.cmet.2010.03.013"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "handle"},
|
||||
"value": "11012/83840"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|opendoar____::2852",
|
||||
"value": "Digital library of Brno University of Technology"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "21459329"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "handle"},
|
||||
"value": "11012/83840"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
{
|
||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
||||
"instance": [
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
|
||||
"value": "Zenodo"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "doi"},
|
||||
"value": "10.5281/zenodo.5121485"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
|
||||
"value": "Europe PubMed Central"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "urn"},
|
||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "scp-number"},
|
||||
"value": "79953761260"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "21459329"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resulttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ], "isGreen": null, "openAccessColor": "gold", "isInDiamondJournal": null, "publiclyFunded": null}
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resulttype" : { "classid" : "publication" }, "isGreen": true, "openAccessColor": "gold", "isInDiamondJournal": true, "publiclyFunded": false }
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1h", "resulttype" : { "classid" : "publication" }, "isGreen": false, "openAccessColor": null, "isInDiamondJournal": true, "publiclyFunded": false }
|
|
@ -0,0 +1,3 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resulttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ], "isGreen": null, "openAccessColor": "gold", "isInDiamondJournal": null, "publiclyFunded": null}
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resulttype" : { "classid" : "publication" }, "isGreen": true, "openAccessColor": "bronze", "isInDiamondJournal": true, "publiclyFunded": false }
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1h", "resulttype" : { "classid" : "publication" }, "isGreen": false, "openAccessColor": null, "isInDiamondJournal": true, "publiclyFunded": false }
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"id": "50|openapc_____::000023f9cb6e3a247c764daec4273cbc",
|
||||
"resuttype": {
|
||||
"classid": "publication"
|
||||
},
|
||||
"instance": [
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|apc_________::e2b1600b229fc30663c8a1f662debddf",
|
||||
"value": "OpenAPC Global Initiative"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "doi"},
|
||||
"value": "10.1016/j.cmet.2010.03.013"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "21459329"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmid"},
|
||||
"value": "25811027"
|
||||
}
|
||||
],
|
||||
"url":["https://doi.org/10.1155/2015/439379"]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
{
|
||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "urn"},
|
||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "scp-number"},
|
||||
"value": "79953761260"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "21459329"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
{
|
||||
"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
||||
"instance": [
|
||||
{
|
||||
"collectedfrom": {
|
||||
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
|
||||
"value": "Europe PubMed Central"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "doi"},
|
||||
"value": "10.1016/j.cmet.2010.03.013"
|
||||
},
|
||||
{
|
||||
"qualifier":{"classid":"pmc"},
|
||||
"value":"PMC21459329"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,428 @@
|
|||
{
|
||||
"author": [
|
||||
{
|
||||
"affiliation": null,
|
||||
"fullname": "Deymier, Ghislaine",
|
||||
"name": "Ghislaine",
|
||||
"pid": [],
|
||||
"rank": 1,
|
||||
"surname": "Deymier"
|
||||
},
|
||||
{
|
||||
"affiliation": null,
|
||||
"fullname": "Gaschet, Frédéric",
|
||||
"name": "Frédéric",
|
||||
"pid": [],
|
||||
"rank": 2,
|
||||
"surname": "Gaschet"
|
||||
},
|
||||
{
|
||||
"affiliation": null,
|
||||
"fullname": "Pouyanne, Guillaume",
|
||||
"name": "Guillaume",
|
||||
"pid": [],
|
||||
"rank": 3,
|
||||
"surname": "Pouyanne"
|
||||
}
|
||||
],
|
||||
"bestaccessright": {
|
||||
"classid": "OPEN",
|
||||
"classname": "Open Access",
|
||||
"schemeid": "dnet:access_modes",
|
||||
"schemename": "dnet:access_modes"
|
||||
},
|
||||
"collectedfrom": [
|
||||
{
|
||||
"key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2",
|
||||
"value" : "Crossref"
|
||||
}
|
||||
],
|
||||
"context": [],
|
||||
"contributor": [],
|
||||
"country": [],
|
||||
"coverage": [],
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"dateofacceptance": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "2013-11-30"
|
||||
},
|
||||
"dateofcollection": "2024-02-28T00:22:13+0000",
|
||||
"dateoftransformation": "2024-03-06T08:43:13.253Z",
|
||||
"description": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "For analyzing the reciprocal interaction between urban sprawl and car use, research has first focused on the link between urban density and mobility. By looking for a reduction in energy consumption, cities have favoured a compact planning development. Then reflection has broadened from the simple density to the wider, multi-dimensional concept of urban form. This controversy has led to a renewal of analysis in term of the costs of urban growth, notably by comparing the costs of \"compact\" and \"sprawled\" development. The idea is to compare the mobility costs of different urban forms. However, most often because of a lack of data, such studies are scarce. This paper suggests an innovative method to compute mobility costs at an infra-urban scale : The Spatialized Travel Account (STA). It is based on the CERTU's travel account methodology at a metropolitan scale. It puts forward an accurate estimate of the mobility costs for each transport mode (individual and public) and for each type of payer (households, firms, local authorities...). In order to test the relationships between mobility costs and urban form, we link the computed costs to morphological characteristics of infra-urban zones, taking in account sociodemographic characteristics of households."
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "L'interaction réciproque entre étalement urbain et usage de l'automobile a conduit la recherche à se focaliser sur le lien entre les densités urbaines et la mobilité. En cherchant à réduire leur consommation d'énergie pour les transports, et donc leurs émissions de Gaz à Effet de Serre, les villes ont alors cherché à planifier la \" ville compacte \", privilégiant notamment la reconstruction de la ville sur elle-même et la densification. Par la suite, la réflexion s'est élargie de la simple densité à la notion de forme urbaine et à toutes ses dimensions. Cette controverse devait conduire à un renouveau des analyses en termes de coûts de la croissance urbaine : le débat reste vif, encore aujourd'hui, sur les coûts comparés de la ville étalée et de la ville compacte. Plus largement, il s'agit d'explorer les coûts des différentes formes urbaines en termes de mobilité. Malgré cela, généralement pour des raisons de disponibilité de données, les études sur le sujet restent extrêmement rares. Cet article propose un outil novateur pour mesurer les coûts de la mobilité à l'échelle intraurbaine : le Compte Déplacements Territorialisé (CDT). Il s'inspire de la méthode développée par le CERTU pour l'établissement des Comptes Déplacements Voyageurs à l'échelle métropolitaine. Le CDT propose, pour chacune des zones de l'agglomération, une estimation précise de l'ensemble des coûts liés aux déplacements de personnes, ventilés par mode de transport (individuels et collectifs) et par type de financeurs (ménages, entreprises, collectivités territoriales, etc.). Nous proposons une application de cette méthode à la controverse sur le lien entre forme urbaine et coûts de la mobilité. Les coûts sont reliés aux caractéristiques morphologiques des zones (en termes de densité et de diversité, notamment), en prenant soin de contrôler les facteurs socio-économiques qui influent traditionnellement sur les comportements de mobilité (taille du ménage, revenu, etc.)."
|
||||
}
|
||||
],
|
||||
"eoscifguidelines": [],
|
||||
"externalReference": [],
|
||||
"extraInfo": [],
|
||||
"format": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "application/pdf"
|
||||
}
|
||||
],
|
||||
"fulltext": [],
|
||||
"id": "50|06cdd3ff4700::4826ac62a11a957fe332e2c291dcfcca",
|
||||
"instance": [
|
||||
{
|
||||
"accessright": {
|
||||
"classid": "OPEN",
|
||||
"classname": "Open Access",
|
||||
"schemeid": "dnet:access_modes",
|
||||
"schemename": "dnet:access_modes"
|
||||
},
|
||||
"alternateIdentifier": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "doi",
|
||||
"classname": "Digital Object Identifier",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "10.46298/cst.12132"
|
||||
}
|
||||
],
|
||||
"collectedfrom": {
|
||||
"key": "10|openaire____::6824b298c96ba906a3e6a70593affbf5",
|
||||
"value": "Episciences"
|
||||
},
|
||||
"dateofacceptance": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "2013-11-30"
|
||||
},
|
||||
"hostedby": {
|
||||
"key": "10|openaire____::6824b298c96ba906a3e6a70593affbf5",
|
||||
"value": "Episciences"
|
||||
},
|
||||
"instanceTypeMapping": [
|
||||
{
|
||||
"originalType": "http://purl.org/coar/resource_type/c_6501",
|
||||
"typeCode": "http://purl.org/coar/resource_type/c_6501",
|
||||
"typeLabel": "journal article",
|
||||
"vocabularyName": "openaire::coar_resource_types_3_1"
|
||||
},
|
||||
{
|
||||
"originalType": "http://purl.org/coar/resource_type/c_6501",
|
||||
"typeCode": "Article",
|
||||
"typeLabel": "Article",
|
||||
"vocabularyName": "openaire::user_resource_types"
|
||||
}
|
||||
],
|
||||
"instancetype": {
|
||||
"classid": "0001",
|
||||
"classname": "Article",
|
||||
"schemeid": "dnet:publication_resource",
|
||||
"schemename": "dnet:publication_resource"
|
||||
},
|
||||
"license": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "CC BY NC SA"
|
||||
},
|
||||
"pid": [],
|
||||
"refereed": {
|
||||
"classid": "0002",
|
||||
"classname": "nonPeerReviewed",
|
||||
"schemeid": "dnet:review_levels",
|
||||
"schemename": "dnet:review_levels"
|
||||
},
|
||||
"url": [
|
||||
"https://doi.org/10.46298/cst.12132",
|
||||
"https://cst.episciences.org/12132"
|
||||
]
|
||||
}
|
||||
],
|
||||
"language": {
|
||||
"classid": "fra/fre",
|
||||
"classname": "French",
|
||||
"schemeid": "dnet:languages",
|
||||
"schemename": "dnet:languages"
|
||||
},
|
||||
"lastupdatetimestamp": 1710636106633,
|
||||
"metaResourceType": {
|
||||
"classid": "Research Literature",
|
||||
"classname": "Research Literature",
|
||||
"schemeid": "openaire::meta_resource_types",
|
||||
"schemename": "openaire::meta_resource_types"
|
||||
},
|
||||
"originalId": [
|
||||
"oai:episciences.org:cst:12132",
|
||||
"50|06cdd3ff4700::4826ac62a11a957fe332e2c291dcfcca"
|
||||
],
|
||||
"pid": [],
|
||||
"publisher": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "episciences.org"
|
||||
},
|
||||
"relevantdate": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "Accepted",
|
||||
"classname": "Accepted",
|
||||
"schemeid": "dnet:dataCite_date",
|
||||
"schemename": "dnet:dataCite_date"
|
||||
},
|
||||
"value": "2024-02-11"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "issued",
|
||||
"classname": "issued",
|
||||
"schemeid": "dnet:dataCite_date",
|
||||
"schemename": "dnet:dataCite_date"
|
||||
},
|
||||
"value": "2013-11-30"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "available",
|
||||
"classname": "available",
|
||||
"schemeid": "dnet:dataCite_date",
|
||||
"schemename": "dnet:dataCite_date"
|
||||
},
|
||||
"value": "2013-11-30"
|
||||
}
|
||||
],
|
||||
"resourcetype": {
|
||||
"classid": "journal article",
|
||||
"classname": "journal article",
|
||||
"schemeid": "dnet:dataCite_resource",
|
||||
"schemename": "dnet:dataCite_resource"
|
||||
},
|
||||
"resulttype": {
|
||||
"classid": "publication",
|
||||
"classname": "publication",
|
||||
"schemeid": "dnet:result_typologies",
|
||||
"schemename": "dnet:result_typologies"
|
||||
},
|
||||
"source": [],
|
||||
"subject": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "JEL: H - Public Economics/H.H7 - State and Local Government • Intergovernmental Relations/H.H7.H72 - State and Local Budget and Expenditures"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "Local public finance"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "JEL: R - Urban, Rural, Regional, Real Estate, and Transportation Economics/R.R5 - Regional Government Analysis/R.R5.R51 - Finance in Urban and Rural Economies"
|
||||
}
|
||||
],
|
||||
"title": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "main title",
|
||||
"classname": "main title",
|
||||
"schemeid": "dnet:dataCite_title",
|
||||
"schemename": "dnet:dataCite_title"
|
||||
},
|
||||
"value": "Urban form and the costs of daily mobility. The spatialized travel account tool and its application to the Bordeaux metropolitan area"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {
|
||||
"classid": "urn"
|
||||
},
|
||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
||||
},
|
||||
{
|
||||
"qualifier": {
|
||||
"classid": "scp-number"
|
||||
},
|
||||
"value": "79953761260"
|
||||
},
|
||||
{
|
||||
"qualifier": {
|
||||
"classid": "pmcid"
|
||||
},
|
||||
"value": "21459329"
|
||||
}
|
||||
]
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -24,7 +24,7 @@
|
|||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<phase>process-resources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
|
@ -49,18 +49,16 @@
|
|||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>edu.cmu</groupId>
|
||||
<artifactId>secondstring</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.code.gson</groupId>
|
||||
<artifactId>gson</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
|
@ -85,10 +83,6 @@
|
|||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-math3</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
|
@ -107,4 +101,90 @@
|
|||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<profiles>
|
||||
<!-- Profile spark-24: active by default (activeByDefault=true). During generate-sources it uses build-helper-maven-plugin's add-source goal to register src/main/spark-2 as an additional source root. -->
<profile>
|
||||
<id>spark-24</id>
|
||||
<activation>
|
||||
<activeByDefault>true</activeByDefault>
|
||||
</activation>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/spark-2</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<!-- Profile spark-34: not active by default (no activation element). During generate-sources it uses build-helper-maven-plugin's add-source goal to register an additional source root. -->
<!-- NOTE(review): the source root registered here is src/main/spark-2, the same directory the spark-24 profile uses; confirm this is intentional and not a copy/paste leftover. -->
<profile>
|
||||
<id>spark-34</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/spark-2</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
|
||||
<!-- Profile spark-35: not active by default (no activation element). During generate-sources it uses build-helper-maven-plugin's add-source goal to register src/main/spark-35 as an additional source root. -->
<profile>
|
||||
<id>spark-35</id>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
<version>3.4.0</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<sources>
|
||||
<source>src/main/spark-35</source>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</profile>
|
||||
</profiles>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -2,31 +2,41 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("keywordsclustering")
|
||||
public class KeywordsClustering extends AbstractClusteringFunction {
|
||||
@ClusteringClass("legalnameclustering")
|
||||
public class LegalnameClustering extends AbstractClusteringFunction {
|
||||
|
||||
public KeywordsClustering(Map<String, Object> params) {
|
||||
private static final Pattern CITY_CODE_PATTERN = Pattern.compile("city::\\d+");
|
||||
private static final Pattern KEYWORD_CODE_PATTERN = Pattern.compile("key::\\d+");
|
||||
|
||||
public LegalnameClustering(Map<String, Object> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
/**
 * Collects every distinct substring of {@code input} that matches {@code codeRegex}.
 *
 * @param input the string to scan
 * @param codeRegex the compiled pattern to search for
 * @return a {@link HashSet} holding each distinct match (no ordering guarantee); empty when nothing matches
 */
public Set<String> getRegexList(String input, Pattern codeRegex) {
|
||||
Matcher matcher = codeRegex.matcher(input);
|
||||
// Despite its name, this set holds whatever codeRegex matches, not only city codes.
Set<String> cities = new HashSet<>();
|
||||
// find() advances through successive matches; group() is the full text of the current match.
while (matcher.find()) {
|
||||
cities.add(matcher.group());
|
||||
}
|
||||
return cities;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, String s) {
|
||||
|
||||
// takes city codes and keywords codes without duplicates
|
||||
Set<String> keywords = getKeywords(s, conf.translationMap(), paramOrDefault("windowSize", 4));
|
||||
Set<String> cities = getCities(s, paramOrDefault("windowSize", 4));
|
||||
|
||||
// list of combination to return as result
|
||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||
|
||||
for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
|
||||
for (String city : citiesToCodes(cities)) {
|
||||
for (String keyword : getRegexList(s, KEYWORD_CODE_PATTERN)) {
|
||||
for (String city : getRegexList(s, CITY_CODE_PATTERN)) {
|
||||
combinations.add(keyword + "-" + city);
|
||||
if (combinations.size() >= paramOrDefault("max", 2)) {
|
||||
return combinations;
|
||||
|
@ -42,9 +52,6 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
|||
return fields
|
||||
.stream()
|
||||
.filter(f -> !f.isEmpty())
|
||||
.map(KeywordsClustering::cleanup)
|
||||
.map(KeywordsClustering::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
|
@ -38,7 +38,7 @@ public class NumAuthorsTitleSuffixPrefixChain extends AbstractClusteringFunction
|
|||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefixChain(cleanup(s), param("mod"));
|
||||
return suffixPrefixChain(cleanup(s), paramOrDefault("mod", 10));
|
||||
}
|
||||
|
||||
private Collection<String> suffixPrefixChain(String s, int mod) {
|
||||
|
|
|
@ -20,7 +20,7 @@ public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
|
|||
return suffixPrefixChain(s, param("mod"));
|
||||
}
|
||||
|
||||
private Collection<String> suffixPrefixChain(String s, int mod) {
|
||||
static Collection<String> suffixPrefixChain(String s, int mod) {
|
||||
|
||||
// create the list of words from the string (remove short words)
|
||||
List<String> wordsList = Arrays
|
||||
|
@ -38,7 +38,7 @@ public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
|
|||
|
||||
}
|
||||
|
||||
private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
|
||||
static private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
|
||||
|
||||
Set<String> set = Sets.newLinkedHashSet();
|
||||
switch (wordsList.size()) {
|
||||
|
@ -80,12 +80,16 @@ public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
|
|||
|
||||
}
|
||||
|
||||
private String suffix(String s, int len) {
|
||||
private static String suffix(String s, int len) {
|
||||
return s.substring(s.length() - len);
|
||||
}
|
||||
|
||||
private String prefix(String s, int len) {
|
||||
private static String prefix(String s, int len) {
|
||||
return s.substring(0, len);
|
||||
}
|
||||
|
||||
/**
 * Manual smoke test: lower-cases a sample title, runs it through
 * {@code suffixPrefixChain} with a {@code mod} of 10 and prints the resulting keys to stdout.
 *
 * @param args ignored
 */
static public void main(String[] args) {
|
||||
String title = "MY LIFE AS A BOSON: THE STORY OF \"THE HIGGS\"".toLowerCase();
|
||||
System.out.println(suffixPrefixChain(title, 10));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,6 @@ package eu.dnetlib.pace.common;
|
|||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
@ -14,24 +13,28 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
|
||||
/**
|
||||
* Set of common functions for the framework
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
public class AbstractPaceFunctions {
|
||||
public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||
|
||||
// city map to be used when translating the city names into codes
|
||||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
// keywords map to be used when translating the keyword names into codes
|
||||
private static Map<String, String> keywordMap = AbstractPaceFunctions
|
||||
.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||
|
||||
// country map to be used when inferring the country from the city name
|
||||
private static Map<String, String> countryMap = AbstractPaceFunctions
|
||||
.loadCountryMapFromClasspath("/eu/dnetlib/pace/config/country_map.csv");
|
||||
|
||||
// list of stopwords in different languages
|
||||
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
|
||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
|
@ -41,9 +44,6 @@ public class AbstractPaceFunctions {
|
|||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||
|
||||
// transliterator
|
||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
// blacklist of ngrams: to avoid generic keys
|
||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||
|
||||
|
@ -51,8 +51,6 @@ public class AbstractPaceFunctions {
|
|||
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
|
||||
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
// doi prefix for normalization
|
||||
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
|
||||
|
@ -84,6 +82,64 @@ public class AbstractPaceFunctions {
|
|||
return s12;
|
||||
}
|
||||
|
||||
public static String countryInference(final String original, String inferFrom) {
|
||||
if (!original.equalsIgnoreCase("unknown"))
|
||||
return original;
|
||||
|
||||
inferFrom = cleanup(inferFrom);
|
||||
inferFrom = normalize(inferFrom);
|
||||
inferFrom = filterAllStopWords(inferFrom);
|
||||
Set<String> cities = getCities(inferFrom, 4);
|
||||
return citiesToCountry(cities).stream().filter(Objects::nonNull).findFirst().orElse("UNKNOWN");
|
||||
}
|
||||
|
||||
public static String cityInference(String original) {
|
||||
original = cleanup(original);
|
||||
original = normalize(original);
|
||||
original = filterAllStopWords(original);
|
||||
|
||||
Set<String> cities = getCities(original, 4);
|
||||
|
||||
for (String city : cities) {
|
||||
original = original.replaceAll(city, cityMap.get(city));
|
||||
}
|
||||
|
||||
return original;
|
||||
}
|
||||
|
||||
public static String keywordInference(String original) {
|
||||
original = cleanup(original);
|
||||
original = normalize(original);
|
||||
original = filterAllStopWords(original);
|
||||
|
||||
Set<String> keywords = getKeywords(original, keywordMap, 4);
|
||||
|
||||
for (String keyword : keywords) {
|
||||
original = original.replaceAll(keyword, keywordMap.get(keyword));
|
||||
}
|
||||
|
||||
return original;
|
||||
}
|
||||
|
||||
public static String cityKeywordInference(String original) {
|
||||
original = cleanup(original);
|
||||
original = normalize(original);
|
||||
original = filterAllStopWords(original);
|
||||
|
||||
Set<String> keywords = getKeywords(original, keywordMap, 4);
|
||||
Set<String> cities = getCities(original, 4);
|
||||
|
||||
for (String keyword : keywords) {
|
||||
original = original.replaceAll(keyword, keywordMap.get(keyword));
|
||||
}
|
||||
|
||||
for (String city : cities) {
|
||||
original = original.replaceAll(city, cityMap.get(city));
|
||||
}
|
||||
|
||||
return original;
|
||||
}
|
||||
|
||||
protected static String fixXML(final String a) {
|
||||
|
||||
return a
|
||||
|
@ -129,25 +185,6 @@ public class AbstractPaceFunctions {
|
|||
return numberPattern.matcher(strNum).matches();
|
||||
}
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
s.chars().forEach(ch -> {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
||||
});
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String transliterate(final String s) {
|
||||
try {
|
||||
return transliterator.transliterate(s);
|
||||
} catch (Exception e) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
protected static String removeSymbols(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
|
@ -162,23 +199,6 @@ public class AbstractPaceFunctions {
|
|||
return s != null;
|
||||
}
|
||||
|
||||
public static String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||
// strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
public static String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
public static String utf8(final String s) {
|
||||
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
||||
return new String(bytes, StandardCharsets.UTF_8);
|
||||
|
@ -233,22 +253,6 @@ public class AbstractPaceFunctions {
|
|||
return newset;
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils
|
||||
.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
@ -270,6 +274,30 @@ public class AbstractPaceFunctions {
|
|||
return m;
|
||||
}
|
||||
|
||||
public static Map<String, String> loadCountryMapFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
final Map<String, String> m = new HashMap<>();
|
||||
try {
|
||||
for (final String s : IOUtils
|
||||
.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||
// string is like this: country_code;city1;city2;city3
|
||||
String[] line = s.split(";");
|
||||
String value = line[0];
|
||||
for (int i = 1; i < line.length; i++) {
|
||||
String city = fixAliases(transliterator.transliterate(line[i].toLowerCase()));
|
||||
String code = cityMap.get(city);
|
||||
m.put(code, value);
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return new HashMap<>();
|
||||
}
|
||||
return m;
|
||||
|
||||
}
|
||||
|
||||
public static String removeKeywords(String s, Set<String> keywords) {
|
||||
|
||||
s = " " + s + " ";
|
||||
|
@ -299,12 +327,12 @@ public class AbstractPaceFunctions {
|
|||
return toCodes(keywords, cityMap);
|
||||
}
|
||||
|
||||
protected static String firstLC(final String s) {
|
||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||
public static Set<String> citiesToCountry(Set<String> cities) {
|
||||
return toCodes(toCodes(cities, cityMap), countryMap);
|
||||
}
|
||||
|
||||
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
protected static String firstLC(final String s) {
|
||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||
}
|
||||
|
||||
public static String normalizePid(String pid) {
|
||||
|
|
|
@ -47,9 +47,37 @@ public class FieldDef implements Serializable {
|
|||
|
||||
private String clean;
|
||||
|
||||
private String infer;
|
||||
|
||||
private String inferenceFrom;
|
||||
|
||||
/**
 * No-argument constructor; performs no initialisation of its own.
 */
public FieldDef() {
|
||||
}
|
||||
|
||||
/**
 * Builds an independent copy of this field definition.
 * <p>
 * A fresh instance is created through the no-argument constructor (not through {@code super.clone()})
 * and name, path, type, overrideMatch, size, length, filter, sorted, clean, infer and inferenceFrom
 * are copied onto it via their setters. The references are copied as they are (no deep copy).
 *
 * @return the new FieldDef carrying the same values as this one
 */
public FieldDef clone() {
|
||||
FieldDef fieldDef = new FieldDef();
|
||||
fieldDef.setName(this.name);
|
||||
fieldDef.setPath(this.path);
|
||||
fieldDef.setType(this.type);
|
||||
fieldDef.setOverrideMatch(this.overrideMatch);
|
||||
fieldDef.setSize(this.size);
|
||||
fieldDef.setLength(this.length);
|
||||
fieldDef.setFilter(this.filter);
|
||||
fieldDef.setSorted(this.sorted);
|
||||
fieldDef.setClean(this.clean);
|
||||
fieldDef.setInfer(this.infer);
|
||||
fieldDef.setInferenceFrom(this.inferenceFrom);
|
||||
return fieldDef;
|
||||
}
|
||||
|
||||
/**
 * @return the path of the value the inference is based on; may be unset
 */
public String getInferenceFrom() {
|
||||
return inferenceFrom;
|
||||
}
|
||||
|
||||
/**
 * @param inferenceFrom the path of the value the inference is based on
 */
public void setInferenceFrom(final String inferenceFrom) {
|
||||
this.inferenceFrom = inferenceFrom;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
@ -126,6 +154,14 @@ public class FieldDef implements Serializable {
|
|||
this.clean = clean;
|
||||
}
|
||||
|
||||
/**
 * @return the name of the inference to apply to this field; may be unset
 */
public String getInfer() {
|
||||
return infer;
|
||||
}
|
||||
|
||||
/**
 * @param infer the name of the inference to apply to this field
 */
public void setInfer(String infer) {
|
||||
this.infer = infer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
|
|
|
@ -19,48 +19,10 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable {
|
|||
val model: SparkModel = SparkModel(conf)
|
||||
|
||||
val dedup: (Dataset[Row] => Dataset[Row]) = df => {
|
||||
df.transform(filterAndCleanup)
|
||||
.transform(generateClustersWithCollect)
|
||||
df.transform(generateClustersWithCollect)
|
||||
.transform(processBlocks)
|
||||
}
|
||||
|
||||
|
||||
val filterAndCleanup: (Dataset[Row] => Dataset[Row]) = df => {
|
||||
val df_with_filters = conf.getPace.getModel.asScala.foldLeft(df)((res, fdef) => {
|
||||
if (conf.blacklists.containsKey(fdef.getName)) {
|
||||
res.withColumn(
|
||||
fdef.getName + "_filtered",
|
||||
filterColumnUDF(fdef).apply(new Column(fdef.getName))
|
||||
)
|
||||
} else {
|
||||
res
|
||||
}
|
||||
})
|
||||
|
||||
df_with_filters
|
||||
}
|
||||
|
||||
def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
|
||||
val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
|
||||
|
||||
if (blacklist == null) {
|
||||
throw new IllegalArgumentException("Column: " + fdef.getName + " does not have any filter")
|
||||
} else {
|
||||
fdef.getType match {
|
||||
case Type.List | Type.JSON =>
|
||||
udf[Array[String], Array[String]](values => {
|
||||
values.filter((v: String) => !blacklist.test(v))
|
||||
})
|
||||
|
||||
case _ =>
|
||||
udf[String, String](v => {
|
||||
if (blacklist.test(v)) ""
|
||||
else v
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => {
|
||||
var df_with_clustering_keys: Dataset[Row] = null
|
||||
|
||||
|
|
|
@ -3,14 +3,14 @@ package eu.dnetlib.pace.model
|
|||
import com.jayway.jsonpath.{Configuration, JsonPath}
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions
|
||||
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil
|
||||
import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
||||
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
|
||||
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||
import org.apache.spark.sql.{Dataset, Row}
|
||||
|
||||
import java.util.Locale
|
||||
import java.util.function.Predicate
|
||||
import java.util.regex.Pattern
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
@ -29,8 +29,20 @@ case class SparkModel(conf: DedupConfig) {
|
|||
identifier.setName(identifierFieldName)
|
||||
identifier.setType(Type.String)
|
||||
|
||||
// create fields for blacklist
|
||||
val filtered = conf.getPace.getModel.asScala.flatMap(fdef => {
|
||||
if (conf.blacklists().containsKey(fdef.getName)) {
|
||||
val fdef_filtered = fdef.clone()
|
||||
fdef_filtered.setName(fdef.getName + "_filtered")
|
||||
Seq(fdef, fdef_filtered)
|
||||
}
|
||||
else {
|
||||
Seq(fdef)
|
||||
}
|
||||
})
|
||||
|
||||
// Construct a Spark StructType representing the schema of the model
|
||||
(Seq(identifier) ++ conf.getPace.getModel.asScala)
|
||||
(Seq(identifier) ++ filtered)
|
||||
.foldLeft(
|
||||
new StructType()
|
||||
)((resType, fieldDef) => {
|
||||
|
@ -44,7 +56,6 @@ case class SparkModel(conf: DedupConfig) {
|
|||
})
|
||||
})
|
||||
|
||||
|
||||
}
|
||||
|
||||
val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)
|
||||
|
@ -52,7 +63,8 @@ case class SparkModel(conf: DedupConfig) {
|
|||
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
|
||||
|
||||
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
|
||||
df.map(r => rowFromJson(r))(RowEncoder(schema))
|
||||
df
|
||||
.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
|
||||
}
|
||||
|
||||
def rowFromJson(json: String): Row = {
|
||||
|
@ -64,41 +76,63 @@ case class SparkModel(conf: DedupConfig) {
|
|||
|
||||
schema.fieldNames.zipWithIndex.foldLeft(values) {
|
||||
case ((res, (fname, index))) =>
|
||||
val fdef = conf.getPace.getModelMap.get(fname)
|
||||
|
||||
val fdef = conf.getPace.getModelMap.get(fname.split("_filtered")(0))
|
||||
|
||||
if (fdef != null) {
|
||||
res(index) = fdef.getType match {
|
||||
case Type.String | Type.Int =>
|
||||
MapDocumentUtil.truncateValue(
|
||||
MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
|
||||
fdef.getLength
|
||||
)
|
||||
if (!fname.contains("_filtered")) { //process fields with no blacklist
|
||||
res(index) = fdef.getType match {
|
||||
case Type.String | Type.Int =>
|
||||
MapDocumentUtil.truncateValue(
|
||||
MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
|
||||
fdef.getLength
|
||||
)
|
||||
|
||||
case Type.URL =>
|
||||
var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
|
||||
if (!URL_REGEX.matcher(uv).matches)
|
||||
uv = ""
|
||||
uv
|
||||
case Type.URL =>
|
||||
var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
|
||||
if (!URL_REGEX.matcher(uv).matches)
|
||||
uv = ""
|
||||
uv
|
||||
|
||||
case Type.List | Type.JSON =>
|
||||
MapDocumentUtil.truncateList(
|
||||
MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
|
||||
fdef.getSize
|
||||
).asScala
|
||||
case Type.List | Type.JSON =>
|
||||
MapDocumentUtil.truncateList(
|
||||
MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
|
||||
fdef.getSize
|
||||
).asScala
|
||||
|
||||
case Type.StringConcat =>
|
||||
val jpaths = CONCAT_REGEX.split(fdef.getPath)
|
||||
case Type.StringConcat =>
|
||||
val jpaths = CONCAT_REGEX.split(fdef.getPath)
|
||||
|
||||
MapDocumentUtil.truncateValue(
|
||||
jpaths
|
||||
.map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
|
||||
.mkString(" "),
|
||||
fdef.getLength
|
||||
)
|
||||
MapDocumentUtil.truncateValue(
|
||||
jpaths
|
||||
.map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
|
||||
.mkString(" "),
|
||||
fdef.getLength
|
||||
)
|
||||
|
||||
case Type.DoubleArray =>
|
||||
MapDocumentUtil.getJPathArray(fdef.getPath, json)
|
||||
case Type.DoubleArray =>
|
||||
MapDocumentUtil.getJPathArray(fdef.getPath, json)
|
||||
}
|
||||
}
|
||||
else { //process fields with blacklist
|
||||
val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
|
||||
|
||||
res(index) = fdef.getType match {
|
||||
case Type.List | Type.JSON =>
|
||||
MapDocumentUtil.truncateList(
|
||||
MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
|
||||
fdef.getSize
|
||||
).asScala.filter((v: String) => !blacklist.test(v))
|
||||
|
||||
case _ =>
|
||||
val value: String = MapDocumentUtil.truncateValue(
|
||||
MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
|
||||
fdef.getLength
|
||||
)
|
||||
if (blacklist.test(value)) "" else value
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
val filter = fdef.getFilter
|
||||
|
||||
|
@ -123,12 +157,22 @@ case class SparkModel(conf: DedupConfig) {
|
|||
case _ => res(index)
|
||||
}
|
||||
}
|
||||
|
||||
if (StringUtils.isNotBlank(fdef.getInfer)) {
|
||||
val inferFrom: String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
|
||||
res(index) = res(index) match {
|
||||
case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
|
||||
case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
res
|
||||
|
||||
}
|
||||
|
||||
new GenericRowWithSchema(values, schema)
|
||||
|
||||
}
|
||||
|
||||
def clean(value: String, cleantype: String) : String = {
|
||||
|
@ -146,5 +190,17 @@ case class SparkModel(conf: DedupConfig) {
|
|||
res
|
||||
}
|
||||
|
||||
/** Applies the inference named by `infertype` to `value`.
  *
  * Supported types are "country" (which also uses `inferfrom`), "city", "keyword" and
  * "city_keyword"; any other type returns `value` unchanged.
  */
def inference(value: String, inferfrom: String, infertype: String): String =
  infertype match {
    case "country"      => AbstractPaceFunctions.countryInference(value, inferfrom)
    case "city"         => AbstractPaceFunctions.cityInference(value)
    case "keyword"      => AbstractPaceFunctions.keywordInference(value)
    case "city_keyword" => AbstractPaceFunctions.cityKeywordInference(value)
    case _              => value
  }
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
@ -11,6 +13,7 @@ import eu.dnetlib.pace.config.Config;
|
|||
import eu.dnetlib.pace.model.Person;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.util.AuthorMatchers;
|
||||
|
||||
@ComparatorClass("authorsMatch")
|
||||
public class AuthorsMatch extends AbstractListComparator {
|
||||
|
@ -41,24 +44,36 @@ public class AuthorsMatch extends AbstractListComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
public double compare(final List<String> left, final List<String> right, final Config conf) {
|
||||
if (left.isEmpty() || right.isEmpty())
|
||||
return -1;
|
||||
|
||||
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
|
||||
if (left.size() > SIZE_THRESHOLD || right.size() > SIZE_THRESHOLD)
|
||||
return 1.0;
|
||||
|
||||
int maxMiss = Integer.MAX_VALUE;
|
||||
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||
|
||||
Double threshold = getDoubleParam("threshold");
|
||||
int maxMiss = Integer.MAX_VALUE;
|
||||
|
||||
if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && a.size() == b.size()) {
|
||||
maxMiss = (int) Math.floor((1 - threshold) * Math.max(a.size(), b.size()));
|
||||
if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && left.size() == right.size()) {
|
||||
maxMiss = (int) Math.floor((1 - threshold) * Math.max(left.size(), right.size()));
|
||||
}
|
||||
|
||||
int common = 0;
|
||||
|
||||
List<String> a = new ArrayList<>(left);
|
||||
List<String> b = new ArrayList<>(right);
|
||||
|
||||
common += AuthorMatchers
|
||||
.removeMatches(a, b, (BiFunction<String, String, Object>) AuthorMatchers::matchEqualsIgnoreCase)
|
||||
.size() / 2;
|
||||
common += AuthorMatchers
|
||||
.removeMatches(a, b, (BiFunction<String, String, Object>) AuthorMatchers::matchOrderedTokenAndAbbreviations)
|
||||
.size() / 2;
|
||||
|
||||
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||
|
||||
// compare each element of List1 with each element of List2
|
||||
int alreadyMatched = common;
|
||||
for (int i = 0; i < a.size(); i++) {
|
||||
Person p1 = new Person(a.get(i), false);
|
||||
|
||||
|
@ -123,13 +138,13 @@ public class AuthorsMatch extends AbstractListComparator {
|
|||
}
|
||||
}
|
||||
|
||||
if (i - common > maxMiss) {
|
||||
if (i - common - alreadyMatched > maxMiss) {
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
// normalization factor to compute the score
|
||||
int normFactor = a.size() == b.size() ? a.size() : (a.size() + b.size() - common);
|
||||
int normFactor = left.size() == right.size() ? left.size() : (left.size() + right.size() - common);
|
||||
|
||||
if (TYPE.equals("percentage")) {
|
||||
return (double) common / normFactor;
|
||||
|
@ -160,5 +175,4 @@ public class AuthorsMatch extends AbstractListComparator {
|
|||
public String normalization(String s) {
|
||||
return normalize(utf8(cleanup(s)));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,48 +0,0 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("cityMatch")
|
||||
public class CityMatch extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
public CityMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> codes1 = citiesToCodes(cities1);
|
||||
Set<String> codes2 = citiesToCodes(cities2);
|
||||
|
||||
// if no cities are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1; // undefined if one of the two has no cities
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("codeMatch")
|
||||
public class CodeMatch extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
private Pattern CODE_REGEX;
|
||||
|
||||
public CodeMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]+::\\d+"));
|
||||
}
|
||||
|
||||
public Set<String> getRegexList(String input) {
|
||||
Matcher matcher = this.CODE_REGEX.matcher(input);
|
||||
Set<String> cities = new HashSet<>();
|
||||
while (matcher.find()) {
|
||||
cities.add(matcher.group());
|
||||
}
|
||||
return cities;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
Set<String> codes1 = getRegexList(a);
|
||||
Set<String> codes2 = getRegexList(b);
|
||||
|
||||
// if no codes are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1; // undefined if one of the two has no codes
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("countryMatch")
|
||||
public class CountryMatch extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
public CountryMatch(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public CountryMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1.0; // return -1 if a field is missing
|
||||
}
|
||||
if (a.equalsIgnoreCase("unknown") || b.equalsIgnoreCase("unknown")) {
|
||||
return -1.0; // return -1 if a country is UNKNOWN
|
||||
}
|
||||
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.time.DateTimeException;
|
||||
import java.time.LocalDate;
|
||||
import java.time.Period;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("dateRange")
|
||||
public class DateRange extends AbstractStringComparator {
|
||||
|
||||
int YEAR_RANGE;
|
||||
|
||||
public DateRange(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3"));
|
||||
}
|
||||
|
||||
public DateRange(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected DateRange(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
public static boolean isNumeric(String str) {
|
||||
return str.matches("\\d+"); // match a number with optional '-' and decimal.
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1.0; // return -1 if a field is missing
|
||||
}
|
||||
|
||||
try {
|
||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH);
|
||||
LocalDate d1 = LocalDate.parse(a, formatter);
|
||||
LocalDate d2 = LocalDate.parse(b, formatter);
|
||||
Period period = Period.between(d1, d2);
|
||||
|
||||
return period.getYears() <= YEAR_RANGE ? 1.0 : 0.0;
|
||||
} catch (DateTimeException e) {
|
||||
return -1.0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
}
|
|
@ -23,15 +23,18 @@ public class InstanceTypeMatch extends AbstractListComparator {
|
|||
|
||||
// jolly types
|
||||
translationMap.put("Conference object", "*");
|
||||
translationMap.put("Research", "*");
|
||||
translationMap.put("Other literature type", "*");
|
||||
translationMap.put("Unknown", "*");
|
||||
translationMap.put("UNKNOWN", "*");
|
||||
|
||||
// article types
|
||||
translationMap.put("Article", "Article");
|
||||
translationMap.put("Journal", "Article");
|
||||
translationMap.put("Data Paper", "Article");
|
||||
translationMap.put("Software Paper", "Article");
|
||||
translationMap.put("Preprint", "Article");
|
||||
translationMap.put("Part of book or chapter of book", "Article");
|
||||
|
||||
// thesis types
|
||||
translationMap.put("Thesis", "Thesis");
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("jaroWinklerLegalname")
|
||||
public class JaroWinklerLegalname extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
private final String CITY_CODE_REGEX = "city::\\d+";
|
||||
private final String KEYWORD_CODE_REGEX = "key::\\d+";
|
||||
|
||||
public JaroWinklerLegalname(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public JaroWinklerLegalname(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerLegalname(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
|
||||
String ca = a.replaceAll(CITY_CODE_REGEX, "").replaceAll(KEYWORD_CODE_REGEX, " ");
|
||||
String cb = b.replaceAll(CITY_CODE_REGEX, "").replaceAll(KEYWORD_CODE_REGEX, " ");
|
||||
|
||||
ca = ca.replaceAll("[ ]{2,}", " ");
|
||||
cb = cb.replaceAll("[ ]{2,}", " ");
|
||||
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,74 +0,0 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("jaroWinklerNormalizedName")
|
||||
public class JaroWinklerNormalizedName extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
public JaroWinklerNormalizedName(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public JaroWinklerNormalizedName(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(
|
||||
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(
|
||||
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
ca = removeKeywords(ca, keywords1);
|
||||
ca = removeKeywords(ca, cities1);
|
||||
cb = removeKeywords(cb, keywords2);
|
||||
cb = removeKeywords(cb, cities2);
|
||||
|
||||
ca = ca.replaceAll("[ ]{2,}", " ");
|
||||
cb = cb.replaceAll("[ ]{2,}", " ");
|
||||
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -41,21 +41,38 @@ public class JsonListMatch extends AbstractListComparator {
|
|||
return -1;
|
||||
}
|
||||
|
||||
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||
switch (MODE) {
|
||||
case "count":
|
||||
return Sets.intersection(ca, cb).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
case "percentage":
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
return (double) incommon / (incommon + simDiff);
|
||||
|
||||
case "type":
|
||||
Set<String> typesA = ca.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
|
||||
Set<String> typesB = cb.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
|
||||
|
||||
Set<String> types = Sets.intersection(typesA, typesB);
|
||||
|
||||
if (types.isEmpty()) // if no common type, it is impossible to compare
|
||||
return -1;
|
||||
|
||||
ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
|
||||
cb = cb.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
|
||||
|
||||
return (double) Sets.intersection(ca, cb).size() / types.size();
|
||||
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (MODE.equals("percentage"))
|
||||
return (double) incommon / (incommon + simDiff);
|
||||
else
|
||||
return incommon;
|
||||
|
||||
}
|
||||
|
||||
// converts every json into a comparable string basing on parameters
|
||||
|
|
|
@ -1,50 +0,0 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("keywordMatch")
|
||||
public class KeywordMatch extends AbstractStringComparator {
|
||||
|
||||
Map<String, String> params;
|
||||
|
||||
public KeywordMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(
|
||||
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(
|
||||
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
|
||||
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
|
||||
|
||||
// if no cities are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1.0; // undefined if one of the two has no keywords
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,112 @@
|
|||
package eu.dnetlib.pace.util
|
||||
|
||||
import java.util.Locale
|
||||
import java.util.regex.Pattern
|
||||
import scala.util.control.Breaks.{break, breakable}
|
||||
|
||||
/** Helper functions for matching author names and pairing entries of two author lists. */
object AuthorMatchers {

  /** Token separator: any run of whitespace, commas or dots. */
  val SPLIT_REGEX = Pattern.compile("[\\s,\\.]+")

  /** Maximum difference in token count accepted by `matchOrderedTokenAndAbbreviations`. */
  val WORD_DIFF = 2

  /** True when both names are non-null and equal ignoring case (locale-independent lowering). */
  def matchEqualsIgnoreCase(a1: String, a2: String): Boolean = {
    if (a1 == null || a2 == null)
      false
    else
      a1 == a2 || a1.toLowerCase(Locale.ROOT).equals(a2.toLowerCase(Locale.ROOT))
  }

  /** True when `fullName` equals, ignoring case, at least one of `otherNames` (null-safe). */
  def matchOtherNames(fullName: String, otherNames: Seq[String]): Boolean = {
    if (otherNames != null) {
      otherNames.exists(matchEqualsIgnoreCase(fullName, _))
    } else {
      false
    }
  }

  /** Lower-cases, splits on `SPLIT_REGEX`, drops empty tokens and sorts the result. */
  private def tokenize(name: String): Array[String] =
    SPLIT_REGEX.split(name.trim.toLowerCase(Locale.ROOT)).filter(_.nonEmpty).sorted

  /** Compares two names token by token, accepting one-letter tokens as abbreviations.
    *
    * Both names are tokenized and sorted, then walked in parallel. Tokens sharing the same
    * initial match when they are identical (a "long" match) or when at least one of them is a
    * single character (a "short" match).
    *
    * @return true when there is at least one long match and the number of matches equals the
    *         token count of the shorter name; false for null input, for names with fewer than
    *         two tokens, or when the token counts differ by more than `WORD_DIFF`
    */
  def matchOrderedTokenAndAbbreviations(a1: String, a2: String): Boolean = {
    // null-safe, consistently with matchEqualsIgnoreCase
    if (a1 == null || a2 == null) return false

    val p1: Array[String] = tokenize(a1)
    val p2: Array[String] = tokenize(a2)

    if (p1.length < 2 || p2.length < 2) return false
    if (Math.abs(p1.length - p2.length) > WORD_DIFF) return false // use alternative comparison algo

    var p1Idx: Int = 0
    var p2Idx: Int = 0
    var shortMatches: Int = 0
    var longMatches: Int = 0
    while (p1Idx < p1.length && p2Idx < p2.length) {
      val e1: String = p1(p1Idx)
      val c1: Char = e1.charAt(0)
      val e2: String = p2(p2Idx)
      val c2: Char = e2.charAt(0)
      if (c1 < c2) p1Idx += 1
      else if (c1 > c2) p2Idx += 1
      else {
        // same initial: either a full-token comparison or an abbreviation match
        var res: Boolean = false
        if (e1.length != 1 && e2.length != 1) {
          res = e1 == e2
          if (res)
            longMatches += 1
        } else {
          res = true
          shortMatches += 1
        }
        if (res) {
          p1Idx += 1
          p2Idx += 1
        } else {
          // different long tokens with the same initial: advance the lexicographically smaller
          val diff: Int = e1.compareTo(e2)
          if (diff < 0) p1Idx += 1
          else if (diff > 0) p2Idx += 1
        }
      }
    }
    longMatches > 0 && (shortMatches + longMatches) == Math.min(p1.length, p2.length)
  }

  /** Java-friendly overload: adapts the `BiFunction` and delegates to the Scala-function variant. */
  def removeMatches(
    graph_authors: java.util.List[String],
    orcid_authors: java.util.List[String],
    matchingFunc: java.util.function.BiFunction[String,String,Boolean]
  ) : java.util.List[String] = {
    removeMatches(graph_authors, orcid_authors, (a, b) => matchingFunc(a,b))
  }

  /** Pairs each graph author with the first ORCID author accepted by `matchingFunc`.
    *
    * Matched entries are REMOVED from both input lists (through their iterators), so the
    * lists must support removal.
    *
    * @return the matched names as a flat list: graph author followed by its ORCID author,
    *         pair after pair; empty when either input list is null or empty
    */
  def removeMatches(
    graph_authors: java.util.List[String],
    orcid_authors: java.util.List[String],
    matchingFunc: (String, String) => Boolean
  ) : java.util.List[String] = {
    val matched = new java.util.ArrayList[String]()

    // orcid_authors is null-checked too: it used to be dereferenced unconditionally
    if (graph_authors != null && !graph_authors.isEmpty && orcid_authors != null) {
      val ait = graph_authors.iterator

      while (ait.hasNext) {
        val author = ait.next()
        val oit = orcid_authors.iterator

        breakable {
          while (oit.hasNext) {
            val orcid = oit.next()

            if (matchingFunc(author, orcid)) {
              ait.remove()
              oit.remove()

              matched.add(author)
              matched.add(orcid)

              // an author is paired at most once: stop scanning the ORCID list
              break()
            }
          }
        }
      }
    }

    matched
  }

}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,12 @@
|
|||
package eu.dnetlib.pace.util
|
||||
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
|
||||
import org.apache.spark.sql.types.StructType
|
||||
|
||||
/** Single entry point for building row encoders.
  * NOTE(review): presumably one variant of this object exists per supported Spark version -- confirm.
  */
object SparkCompatUtils {

  /** Returns the `Row` encoder that `RowEncoder` builds for the given schema. */
  def encoderFor(schema: StructType): ExpressionEncoder[Row] =
    RowEncoder.apply(schema)
}
|
|
@ -0,0 +1,12 @@
|
|||
package eu.dnetlib.pace.util
|
||||
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
|
||||
import org.apache.spark.sql.types.StructType
|
||||
|
||||
/** Single entry point for building row encoders.
  * NOTE(review): presumably one variant of this object exists per supported Spark version -- confirm.
  */
object SparkCompatUtils {

  /** Returns the `Row` encoder that `ExpressionEncoder` builds for the given schema. */
  def encoderFor(schema: StructType): ExpressionEncoder[Row] =
    ExpressionEncoder.apply(schema)
}
|
|
@ -8,6 +8,7 @@ import org.junit.jupiter.api.Test;
|
|||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.mongodb.connection.Cluster;
|
||||
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
|
@ -177,41 +178,16 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testKeywordsClustering() {
|
||||
public void legalnameClustering() {
|
||||
|
||||
final ClusteringFunction cf = new KeywordsClustering(params);
|
||||
final String s = "Polytechnic University of Turin";
|
||||
final ClusteringFunction cf = new LegalnameClustering(params);
|
||||
String s = "key::1 key::2 city::1";
|
||||
System.out.println(s);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||
|
||||
final String s1 = "POLITECNICO DI TORINO";
|
||||
System.out.println(s1);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
|
||||
|
||||
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
||||
System.out.println("s2 = " + s2);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s2)));
|
||||
|
||||
final String s3 = "universita universita milano milano";
|
||||
System.out.println("s3 = " + s3);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s3)));
|
||||
|
||||
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
||||
System.out.println("s4 = " + s4);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s4)));
|
||||
|
||||
final String s5 = "İstanbul Ticarət Universiteti";
|
||||
System.out.println("s5 = " + s5);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s5)));
|
||||
|
||||
final String s6 = "National and Kapodistrian University of Athens";
|
||||
System.out.println("s6 = " + s6);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s6)));
|
||||
|
||||
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
|
||||
System.out.println("s7 = " + s7);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s7)));
|
||||
|
||||
s = "key::1 key::2 city::1 city::2";
|
||||
System.out.println(s);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -251,4 +227,17 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNumAuthorsTitleSuffixPrefixChain() {
|
||||
|
||||
final ClusteringFunction cf = new NumAuthorsTitleSuffixPrefixChain(params);
|
||||
params.put("mod", 10);
|
||||
|
||||
final String title = "PARP-2 Regulates SIRT1 Expression and Whole-Body Energy Expenditure";
|
||||
final String num_authors = "10";
|
||||
System.out.println("title = " + title);
|
||||
System.out.println("num_authors = " + num_authors);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(num_authors, title)));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
|
||||
package eu.dnetlib.pace.common;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
|
@ -54,4 +53,56 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
|
|||
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test()
|
||||
public void countryInferenceTest_NPE() {
|
||||
assertThrows(
|
||||
NullPointerException.class,
|
||||
() -> countryInference("UNKNOWN", null),
|
||||
"Expected countryInference() to throw an NPE");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void countryInferenceTest() {
|
||||
assertEquals("UNKNOWN", countryInference("UNKNOWN", ""));
|
||||
assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
|
||||
assertEquals("UK", countryInference("UK", "Università di Bologna"));
|
||||
assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
|
||||
assertEquals("UNKNOWN", countryInference("UNKNOWN", "Università del Lavoro"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void cityInferenceTest() {
|
||||
assertEquals("universita city::3181928", cityInference("Università di Bologna"));
|
||||
assertEquals("university city::3170647", cityInference("University of Pisa"));
|
||||
assertEquals("universita", cityInference("Università del lavoro"));
|
||||
assertEquals("universita city::3173331 city::3169522", cityInference("Università di Modena e Reggio Emilia"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void keywordInferenceTest() {
|
||||
assertEquals("key::41 turin", keywordInference("Polytechnic University of Turin"));
|
||||
assertEquals("key::41 torino", keywordInference("POLITECNICO DI TORINO"));
|
||||
assertEquals(
|
||||
"key::1 key::60 key::81 milano bergamo",
|
||||
keywordInference("Universita farmaceutica culturale di milano bergamo"));
|
||||
assertEquals("key::1 key::1 milano milano", keywordInference("universita universita milano milano"));
|
||||
assertEquals(
|
||||
"key::10 kapodistriako panepistemio athenon",
|
||||
keywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void cityKeywordInferenceTest() {
|
||||
assertEquals("key::41 city::3165524", cityKeywordInference("Polytechnic University of Turin"));
|
||||
assertEquals("key::41 city::3165524", cityKeywordInference("POLITECNICO DI TORINO"));
|
||||
assertEquals(
|
||||
"key::1 key::60 key::81 city::3173435 city::3182164",
|
||||
cityKeywordInference("Universita farmaceutica culturale di milano bergamo"));
|
||||
assertEquals(
|
||||
"key::1 key::1 city::3173435 city::3173435", cityKeywordInference("universita universita milano milano"));
|
||||
assertEquals(
|
||||
"key::10 kapodistriako panepistemio city::264371",
|
||||
cityKeywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -35,6 +35,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
params.put("name_th", "0.95");
|
||||
params.put("jpath_value", "$.value");
|
||||
params.put("jpath_classid", "$.qualifier.classid");
|
||||
params.put("codeRegex", "key::\\d+");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -44,53 +45,61 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void cityMatchTest() {
|
||||
final CityMatch cityMatch = new CityMatch(params);
|
||||
public void codeMatchTest() {
|
||||
CodeMatch codeMatch = new CodeMatch(params);
|
||||
|
||||
// both names with no cities
|
||||
assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
|
||||
// both names with no codes
|
||||
assertEquals(1.0, codeMatch.distance("testing1", "testing2", conf));
|
||||
|
||||
// one of the two names with no cities
|
||||
assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
|
||||
// one of the two names with no codes
|
||||
assertEquals(-1.0, codeMatch.distance("testing1 key::1", "testing", conf));
|
||||
|
||||
// both names with cities (same)
|
||||
assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
|
||||
// both names with codes (same)
|
||||
assertEquals(1.0, codeMatch.distance("testing1 key::1", "testing2 key::1", conf));
|
||||
|
||||
// both names with cities (different)
|
||||
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
|
||||
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
|
||||
// both names with codes (different)
|
||||
assertEquals(0.0, codeMatch.distance("testing1 key::1", "testing2 key::2", conf));
|
||||
|
||||
// particular cases
|
||||
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
|
||||
assertEquals(
|
||||
1.0,
|
||||
cityMatch
|
||||
.distance(
|
||||
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
|
||||
conf));
|
||||
// both names with codes (1 same, 1 different)
|
||||
assertEquals(0.5, codeMatch.distance("key::1 key::2 testing1", "key::1 testing", conf));
|
||||
|
||||
// failing because 'Allen' is a transliterated Greek stopword
|
||||
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
||||
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void keywordMatchTest() {
|
||||
params.put("threshold", "0.5");
|
||||
public void datasetVersionCodeMatchTest() {
|
||||
|
||||
final KeywordMatch keywordMatch = new KeywordMatch(params);
|
||||
params.put("codeRegex", "(?=[\\w-]*[a-zA-Z])(?=[\\w-]*\\d)[\\w-]+");
|
||||
CodeMatch codeMatch = new CodeMatch(params);
|
||||
|
||||
// names have different codes
|
||||
assertEquals(
|
||||
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
|
||||
assertEquals(2.0 / 3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
|
||||
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
|
||||
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
|
||||
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
||||
0.0,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998 ev02a",
|
||||
"physical oceanography at ctd station june 1998 ir02", conf));
|
||||
|
||||
// names have same code
|
||||
assertEquals(
|
||||
1.0,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998 ev02a",
|
||||
"physical oceanography at ctd station june 1998 ev02a", conf));
|
||||
|
||||
// code is not in both names
|
||||
assertEquals(
|
||||
-1,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998",
|
||||
"physical oceanography at ctd station june 1998 ev02a", conf));
|
||||
assertEquals(
|
||||
1.0,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998",
|
||||
conf));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -155,15 +164,15 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void jaroWinklerNormalizedNameTest() {
|
||||
public void jaroWinklerLegalnameTest() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
final JaroWinklerLegalname jaroWinklerLegalname = new JaroWinklerLegalname(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName
|
||||
.distance("AT&T (United States)", "United States Military Academy", conf);
|
||||
double result = jaroWinklerLegalname
|
||||
.distance("AT&T (United States)", "United States key::2 key::1", conf);
|
||||
System.out.println("result = " + result);
|
||||
|
||||
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
||||
result = jaroWinklerLegalname.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
||||
System.out.println("result = " + result);
|
||||
|
||||
}
|
||||
|
@ -285,15 +294,15 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
List<String> a = createFieldList(
|
||||
Arrays
|
||||
.asList(
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"),
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"),
|
||||
"authors");
|
||||
List<String> b = createFieldList(
|
||||
Arrays
|
||||
.asList(
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"),
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_2\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"isni\",\"classname\":\"ISNI Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"isni_1\"}"),
|
||||
"authors");
|
||||
|
||||
double result = jsonListMatch.compare(a, b, conf);
|
||||
|
@ -305,6 +314,13 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
result = jsonListMatch.compare(a, b, conf);
|
||||
|
||||
assertEquals(1.0, result);
|
||||
|
||||
params.put("mode", "type");
|
||||
jsonListMatch = new JsonListMatch(params);
|
||||
result = jsonListMatch.compare(a, b, conf);
|
||||
|
||||
assertEquals(0.5, result);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -336,4 +352,53 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
System.out.println("compare = " + compare);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void countryMatch() {
|
||||
|
||||
CountryMatch countryMatch = new CountryMatch(params);
|
||||
|
||||
double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
|
||||
assertEquals(-1.0, result);
|
||||
|
||||
result = countryMatch.distance("CL", "UNKNOWN", conf);
|
||||
assertEquals(-1.0, result);
|
||||
|
||||
result = countryMatch.distance("CL", "IT", conf);
|
||||
assertEquals(0.0, result);
|
||||
|
||||
result = countryMatch.distance("CL", "CL", conf);
|
||||
assertEquals(1.0, result);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void dateMatch() {
|
||||
|
||||
DateRange dateRange = new DateRange(params);
|
||||
|
||||
double result = dateRange.distance("2021-05-13", "2023-05-13", conf);
|
||||
assertEquals(1.0, result);
|
||||
|
||||
result = dateRange.distance("2021-05-13", "2025-05-13", conf);
|
||||
assertEquals(0.0, result);
|
||||
|
||||
result = dateRange.distance("", "2020-05-05", conf);
|
||||
assertEquals(-1.0, result);
|
||||
|
||||
result = dateRange.distance("invalid date", "2021-05-02", conf);
|
||||
assertEquals(-1.0, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void titleVersionMatchTest() {
|
||||
|
||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
||||
|
||||
double result = titleVersionMatch
|
||||
.compare(
|
||||
"parp 2 regulates sirt 1 expression and whole body energy expenditure",
|
||||
"parp 2 regulates sirt 1 expression and whole body energy expenditure", conf);
|
||||
assertEquals(1.0, result);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-shade-package</artifactId>
|
||||
<description>This module creates a jar of all module dependencies</description>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<artifactId>maven-shade-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>shade</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<transformers>
|
||||
<transformer>
|
||||
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
|
||||
</transformer>
|
||||
<transformer />
|
||||
<transformer>
|
||||
<resource>META-INF/cxf/bus-extensions.txt</resource>
|
||||
</transformer>
|
||||
</transformers>
|
||||
<filters>
|
||||
<filter>
|
||||
<artifact>*:*</artifact>
|
||||
<excludes>
|
||||
<exclude>META-INF/maven/**</exclude>
|
||||
<exclude>META-INF/*.SF</exclude>
|
||||
<exclude>META-INF/*.DSA</exclude>
|
||||
<exclude>META-INF/*.RSA</exclude>
|
||||
</excludes>
|
||||
</filter>
|
||||
</filters>
|
||||
<relocations>
|
||||
<relocation>
|
||||
<pattern>com</pattern>
|
||||
<shadedPattern>repackaged.com.google.common</shadedPattern>
|
||||
<includes>
|
||||
<include>com.google.common.**</include>
|
||||
</includes>
|
||||
</relocation>
|
||||
</relocations>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<version>1.18.28</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter</artifactId>
|
||||
<version>5.6.1</version>
|
||||
<scope>test</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>junit-jupiter-api</artifactId>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>junit-jupiter-params</artifactId>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>junit-jupiter-engine</artifactId>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-core</artifactId>
|
||||
<version>3.3.3</version>
|
||||
<scope>test</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>byte-buddy</artifactId>
|
||||
<groupId>net.bytebuddy</groupId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<artifactId>byte-buddy-agent</artifactId>
|
||||
<groupId>net.bytebuddy</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-junit-jupiter</artifactId>
|
||||
<version>3.3.3</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-common</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
</project>
|
|
@ -0,0 +1,169 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.5-SNAPSHOT</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-shade-package</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-common</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
<description>This module creates a jar of all module dependencies</description>
|
||||
|
||||
|
||||
<dependencies>
|
||||
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-actionmanager</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-blacklist</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-broker-events</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-enrichment</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-graph-mapper</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-graph-provision</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-impact-indicators</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-actionsets</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-hist-snaps</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-monitor-irish</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-promote</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-update</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-swh</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-usage-raw-data-update</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-usage-stats-build</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
</dependencies>
|
||||
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-shade-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>shade</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<transformers>
|
||||
<transformer
|
||||
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
|
||||
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
|
||||
</transformer>
|
||||
<!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
|
||||
<transformer
|
||||
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
|
||||
<transformer
|
||||
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
|
||||
<resource>META-INF/cxf/bus-extensions.txt</resource>
|
||||
</transformer>
|
||||
</transformers>
|
||||
<filters>
|
||||
<filter>
|
||||
<artifact>*:*</artifact>
|
||||
<excludes>
|
||||
<exclude>META-INF/maven/**</exclude>
|
||||
<exclude>META-INF/*.SF</exclude>
|
||||
<exclude>META-INF/*.DSA</exclude>
|
||||
<exclude>META-INF/*.RSA</exclude>
|
||||
</excludes>
|
||||
</filter>
|
||||
</filters>
|
||||
<relocations>
|
||||
<relocation>
|
||||
<pattern>com</pattern>
|
||||
<shadedPattern>repackaged.com.google.common</shadedPattern>
|
||||
<includes>
|
||||
<include>com.google.common.**</include>
|
||||
</includes>
|
||||
</relocation>
|
||||
</relocations>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
|
@ -51,48 +51,5 @@
|
|||
<artifactId>hadoop-distcp</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-api</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-common</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>saxonica</groupId>
|
||||
<artifactId>saxon</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>saxonica</groupId>
|
||||
<artifactId>saxon-dom</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>jgrapht</groupId>
|
||||
<artifactId>jgrapht</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>net.sf.ehcache</groupId>
|
||||
<artifactId>ehcache</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-test</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.apache.*</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>apache</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</project>
|
||||
|
|
|
@ -4,7 +4,6 @@ package eu.dnetlib.dhp.actionmanager;
|
|||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -22,7 +21,6 @@ import com.google.common.base.Splitter;
|
|||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.actionmanager.rmi.ActionManagerException;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
@ -65,7 +63,7 @@ public class ISClient implements Serializable {
|
|||
.map(t -> buildDirectory(basePath, t))
|
||||
.collect(Collectors.toList()))
|
||||
.orElseThrow(() -> new IllegalStateException("empty set list"));
|
||||
} catch (ActionManagerException | ISLookUpException e) {
|
||||
} catch (ISLookUpException e) {
|
||||
throw new IllegalStateException("unable to query ActionSets info from the IS");
|
||||
}
|
||||
}
|
||||
|
@ -89,31 +87,18 @@ public class ISClient implements Serializable {
|
|||
return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight());
|
||||
}
|
||||
|
||||
private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException {
|
||||
private String getBasePathHDFS(ISLookUpService isLookup) throws ISLookUpException {
|
||||
return queryServiceProperty(isLookup, "basePath");
|
||||
}
|
||||
|
||||
private String queryServiceProperty(ISLookUpService isLookup, final String propertyName)
|
||||
throws ActionManagerException {
|
||||
throws ISLookUpException {
|
||||
final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='"
|
||||
+ propertyName
|
||||
+ "']/@value/string()";
|
||||
log.debug("quering for service property: {}", q);
|
||||
try {
|
||||
final List<String> value = isLookup.quickSearchProfile(q);
|
||||
return Iterables.getOnlyElement(value);
|
||||
} catch (ISLookUpException e) {
|
||||
String msg = "Error accessing service profile, using query: " + q;
|
||||
log.error(msg, e);
|
||||
throw new ActionManagerException(msg, e);
|
||||
} catch (NoSuchElementException e) {
|
||||
String msg = "missing service property: " + propertyName;
|
||||
log.error(msg, e);
|
||||
throw new ActionManagerException(msg, e);
|
||||
} catch (IllegalArgumentException e) {
|
||||
String msg = "found more than one service property: " + propertyName;
|
||||
log.error(msg, e);
|
||||
throw new ActionManagerException(msg, e);
|
||||
}
|
||||
|
||||
final List<String> value = isLookup.quickSearchProfile(q);
|
||||
return Iterables.getOnlyElement(value);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,8 +7,7 @@ import java.util.function.BiFunction;
|
|||
|
||||
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
|
||||
|
||||
/** OAF model merging support. */
|
||||
public class MergeAndGet {
|
||||
|
@ -46,20 +45,7 @@ public class MergeAndGet {
|
|||
}
|
||||
|
||||
private static <G extends Oaf, A extends Oaf> G mergeFromAndGet(G x, A y) {
|
||||
if (isSubClass(x, Relation.class) && isSubClass(y, Relation.class)) {
|
||||
((Relation) x).mergeFrom((Relation) y);
|
||||
return x;
|
||||
} else if (isSubClass(x, OafEntity.class)
|
||||
&& isSubClass(y, OafEntity.class)
|
||||
&& isSubClass(x, y)) {
|
||||
((OafEntity) x).mergeFrom((OafEntity) y);
|
||||
return x;
|
||||
}
|
||||
throw new RuntimeException(
|
||||
String
|
||||
.format(
|
||||
"MERGE_FROM_AND_GET incompatible types: %s, %s",
|
||||
x.getClass().getCanonicalName(), y.getClass().getCanonicalName()));
|
||||
return (G) MergeUtils.merge(x, y);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
/*
|
||||
* Copyright (c) 2024.
|
||||
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*/
|
||||
|
||||
package eu.dnetlib.dhp.actionmanager.promote;
|
||||
|
||||
/** Encodes the Actionset promotion strategies and the join type implementing each of them. */
public class PromoteAction {

	/**
	 * The supported actionset promotion strategies.
	 *
	 * ENRICH: promotes only the records in the actionset matching another record in the
	 * graph and enriches them applying the given MergeAndGet strategy.
	 * UPSERT: promotes all the records in an actionset; matching records are updated
	 * using the given MergeAndGet strategy, the non-matching records are inserted as they are.
	 */
	public enum Strategy {
		ENRICH, UPSERT
	}

	/**
	 * Returns the string representation of the join type implementing the given PromoteAction.
	 *
	 * @param strategy the strategy to be used to promote the Actionset contents, must not be null
	 * @return the join type used to implement the promotion strategy:
	 *         "left_outer" for ENRICH, "full_outer" for UPSERT
	 * @throws NullPointerException if the given strategy is null
	 * @throws IllegalStateException if the given strategy is not associated to any join type
	 */
	public static String joinTypeForStrategy(PromoteAction.Strategy strategy) {
		// fail with an explicit message instead of the anonymous NullPointerException raised by the switch;
		// the fully qualified name avoids touching the file-level imports
		java.util.Objects.requireNonNull(strategy, "strategy must not be null");

		switch (strategy) {
			case ENRICH:
				return "left_outer";
			case UPSERT:
				return "full_outer";
			default:
				// reachable only if a new Strategy constant is added without updating this mapping
				throw new IllegalStateException("unsupported PromoteAction: " + strategy);
		}
	}
}
|
|
@ -67,8 +67,9 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
String outputGraphTablePath = parser.get("outputGraphTablePath");
|
||||
logger.info("outputGraphTablePath: {}", outputGraphTablePath);
|
||||
|
||||
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
|
||||
logger.info("strategy: {}", strategy);
|
||||
MergeAndGet.Strategy mergeAndGetStrategy = MergeAndGet.Strategy
|
||||
.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
|
||||
logger.info("mergeAndGetStrategy: {}", mergeAndGetStrategy);
|
||||
|
||||
Boolean shouldGroupById = Optional
|
||||
.ofNullable(parser.get("shouldGroupById"))
|
||||
|
@ -76,6 +77,12 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
.orElse(true);
|
||||
logger.info("shouldGroupById: {}", shouldGroupById);
|
||||
|
||||
PromoteAction.Strategy promoteActionStrategy = Optional
|
||||
.ofNullable(parser.get("promoteActionStrategy"))
|
||||
.map(PromoteAction.Strategy::valueOf)
|
||||
.orElse(PromoteAction.Strategy.UPSERT);
|
||||
logger.info("promoteActionStrategy: {}", promoteActionStrategy);
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
|
||||
@SuppressWarnings("unchecked")
|
||||
|
@ -97,7 +104,8 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
inputGraphTablePath,
|
||||
inputActionPayloadPath,
|
||||
outputGraphTablePath,
|
||||
strategy,
|
||||
mergeAndGetStrategy,
|
||||
promoteActionStrategy,
|
||||
rowClazz,
|
||||
actionPayloadClazz,
|
||||
shouldGroupById);
|
||||
|
@ -124,14 +132,16 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
String inputGraphTablePath,
|
||||
String inputActionPayloadPath,
|
||||
String outputGraphTablePath,
|
||||
MergeAndGet.Strategy strategy,
|
||||
MergeAndGet.Strategy mergeAndGetStrategy,
|
||||
PromoteAction.Strategy promoteActionStrategy,
|
||||
Class<G> rowClazz,
|
||||
Class<A> actionPayloadClazz, Boolean shouldGroupById) {
|
||||
Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz);
|
||||
Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz);
|
||||
|
||||
Dataset<G> result = promoteActionPayloadForGraphTable(
|
||||
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz, shouldGroupById)
|
||||
rowDS, actionPayloadDS, mergeAndGetStrategy, promoteActionStrategy, rowClazz, actionPayloadClazz,
|
||||
shouldGroupById)
|
||||
.map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz));
|
||||
|
||||
saveGraphTable(result, outputGraphTablePath);
|
||||
|
@ -141,12 +151,17 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
SparkSession spark, String path, Class<G> rowClazz) {
|
||||
logger.info("Reading graph table from path: {}", path);
|
||||
|
||||
return spark
|
||||
.read()
|
||||
.textFile(path)
|
||||
.map(
|
||||
(MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
|
||||
Encoders.bean(rowClazz));
|
||||
if (HdfsSupport.exists(path, spark.sparkContext().hadoopConfiguration())) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(path)
|
||||
.map(
|
||||
(MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
|
||||
Encoders.bean(rowClazz));
|
||||
} else {
|
||||
logger.info("Found empty graph table from path: {}", path);
|
||||
return spark.emptyDataset(Encoders.bean(rowClazz));
|
||||
}
|
||||
}
|
||||
|
||||
private static <A extends Oaf> Dataset<A> readActionPayload(
|
||||
|
@ -183,7 +198,8 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
private static <G extends Oaf, A extends Oaf> Dataset<G> promoteActionPayloadForGraphTable(
|
||||
Dataset<G> rowDS,
|
||||
Dataset<A> actionPayloadDS,
|
||||
MergeAndGet.Strategy strategy,
|
||||
MergeAndGet.Strategy mergeAndGetStrategy,
|
||||
PromoteAction.Strategy promoteActionStrategy,
|
||||
Class<G> rowClazz,
|
||||
Class<A> actionPayloadClazz,
|
||||
Boolean shouldGroupById) {
|
||||
|
@ -195,8 +211,9 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
|
||||
SerializableSupplier<Function<G, String>> rowIdFn = ModelSupport::idFn;
|
||||
SerializableSupplier<Function<A, String>> actionPayloadIdFn = ModelSupport::idFn;
|
||||
SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy);
|
||||
SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy);
|
||||
SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet
|
||||
.functionFor(mergeAndGetStrategy);
|
||||
SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(mergeAndGetStrategy);
|
||||
SerializableSupplier<G> zeroFn = zeroFn(rowClazz);
|
||||
SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget;
|
||||
|
||||
|
@ -207,10 +224,11 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
rowIdFn,
|
||||
actionPayloadIdFn,
|
||||
mergeRowWithActionPayloadAndGetFn,
|
||||
promoteActionStrategy,
|
||||
rowClazz,
|
||||
actionPayloadClazz);
|
||||
|
||||
if (shouldGroupById) {
|
||||
if (Boolean.TRUE.equals(shouldGroupById)) {
|
||||
return PromoteActionPayloadFunctions
|
||||
.groupGraphTableByIdAndMerge(
|
||||
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
|
||||
|
@ -237,6 +255,8 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Relation());
|
||||
case "eu.dnetlib.dhp.schema.oaf.Software":
|
||||
return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Software());
|
||||
case "eu.dnetlib.dhp.schema.oaf.Person":
|
||||
return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Person());
|
||||
default:
|
||||
throw new RuntimeException("unknown class: " + clazz.getCanonicalName());
|
||||
}
|
||||
|
|
|
@ -34,6 +34,7 @@ public class PromoteActionPayloadFunctions {
|
|||
* @param rowIdFn Function used to get the id of graph table row
|
||||
* @param actionPayloadIdFn Function used to get id of action payload instance
|
||||
* @param mergeAndGetFn Function used to merge graph table row and action payload instance
|
||||
* @param promoteActionStrategy the Actionset promotion strategy
|
||||
* @param rowClazz Class of graph table
|
||||
* @param actionPayloadClazz Class of action payload
|
||||
* @param <G> Type of graph table row
|
||||
|
@ -46,9 +47,10 @@ public class PromoteActionPayloadFunctions {
|
|||
SerializableSupplier<Function<G, String>> rowIdFn,
|
||||
SerializableSupplier<Function<A, String>> actionPayloadIdFn,
|
||||
SerializableSupplier<BiFunction<G, A, G>> mergeAndGetFn,
|
||||
PromoteAction.Strategy promoteActionStrategy,
|
||||
Class<G> rowClazz,
|
||||
Class<A> actionPayloadClazz) {
|
||||
if (!isSubClass(rowClazz, actionPayloadClazz)) {
|
||||
if (Boolean.FALSE.equals(isSubClass(rowClazz, actionPayloadClazz))) {
|
||||
throw new RuntimeException(
|
||||
"action payload type must be the same or be a super type of table row type");
|
||||
}
|
||||
|
@ -61,7 +63,7 @@ public class PromoteActionPayloadFunctions {
|
|||
.joinWith(
|
||||
actionPayloadWithIdDS,
|
||||
rowWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")),
|
||||
"full_outer")
|
||||
PromoteAction.joinTypeForStrategy(promoteActionStrategy))
|
||||
.map(
|
||||
(MapFunction<Tuple2<Tuple2<String, G>, Tuple2<String, A>>, G>) value -> {
|
||||
Optional<G> rowOpt = Optional.ofNullable(value._1()).map(Tuple2::_2);
|
||||
|
|
|
@ -41,6 +41,12 @@
|
|||
"paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pas",
|
||||
"paramLongName": "promoteActionStrategy",
|
||||
"paramDescription": "strategy for promoting the actionset contents into the graph tables, ENRICH or UPSERT (default)",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "sgid",
|
||||
"paramLongName": "shouldGroupById",
|
||||
|
|
|
@ -103,6 +103,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -115,6 +116,7 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="DecisionPromoteResultActionPayloadForDatasetTable"/>
|
||||
|
@ -155,6 +157,7 @@
|
|||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -167,6 +170,7 @@
|
|||
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
|
||||
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg>
|
||||
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
|
||||
<arg>--promoteActionStrategy</arg><arg>${promoteActionStrategy}</arg>
|
||||
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue