Compare commits

..

No commits in common. "main" and "merge_by_id_fix" have entirely different histories.

168 changed files with 2572 additions and 7079 deletions

1
.gitignore vendored
View File

@ -27,4 +27,3 @@ spark-warehouse
/**/.factorypath /**/.factorypath
/**/.scalafmt.conf /**/.scalafmt.conf
/.java-version /.java-version
/dhp-shade-package/dependency-reduced-pom.xml

View File

@ -80,15 +80,7 @@ class WritePredefinedProjectPropertiesTest {
mojo.outputFile = testFolder; mojo.outputFile = testFolder;
// execute // execute
try { Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
mojo.execute();
Assertions.assertTrue(false); // not reached
} catch (Exception e) {
Assertions
.assertTrue(
MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
IllegalArgumentException.class.isAssignableFrom(e.getClass()));
}
} }
@Test @Test

View File

@ -38,7 +38,7 @@ public class PacePerson {
PacePerson.class PacePerson.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt"))); "/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (Exception e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }

View File

@ -12,7 +12,9 @@ import java.util.concurrent.TimeUnit;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.http.HttpHeaders; import org.apache.http.HttpHeaders;
import org.joda.time.Instant;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -212,11 +214,11 @@ public class HttpConnector2 {
.format( .format(
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(), "Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
MAPPER.writeValueAsString(report))); MAPPER.writeValueAsString(report)));
} catch (MalformedURLException e) { } catch (MalformedURLException | UnknownHostException e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage()); report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e.getMessage(), e); throw new CollectorException(e.getMessage(), e);
} catch (SocketTimeoutException | SocketException | UnknownHostException e) { } catch (SocketTimeoutException | SocketException e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage()); report.put(e.getClass().getName(), e.getMessage());
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000); backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);

View File

@ -1,70 +0,0 @@
/*
* Copyright (c) 2024.
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package eu.dnetlib.dhp.schema.oaf;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
/**
 * {@link StructuredProperty} variant whose {@link #equals(Object)} and {@link #hashCode()} are
 * computed only from the qualifier's classid, the qualifier's schemeid and the value, so that
 * instances can be compared and de-duplicated in hash-based collections.
 */
public class HashableStructuredProperty extends StructuredProperty {

    private static final long serialVersionUID = 8371670185221126045L;

    /**
     * Builds a new instance from its components.
     *
     * @param value the property value
     * @param qualifier the qualifier describing the value
     * @param dataInfo the provenance information attached to the value
     * @return the new instance, or {@code null} when {@code value} is {@code null}
     */
    public static HashableStructuredProperty newInstance(String value, Qualifier qualifier, DataInfo dataInfo) {
        if (value == null) {
            return null;
        }
        final HashableStructuredProperty sp = new HashableStructuredProperty();
        sp.setValue(value);
        sp.setQualifier(qualifier);
        sp.setDataInfo(dataInfo);
        return sp;
    }

    /**
     * Wraps a plain {@link StructuredProperty} into its hashable counterpart.
     *
     * @param sp the property to copy
     * @return a new instance carrying the same value, qualifier and dataInfo
     */
    public static HashableStructuredProperty newInstance(StructuredProperty sp) {
        final HashableStructuredProperty hsp = new HashableStructuredProperty();
        hsp.setValue(sp.getValue());
        hsp.setQualifier(sp.getQualifier());
        // the qualifier used to be set twice here while dataInfo was silently dropped
        hsp.setDataInfo(sp.getDataInfo());
        return hsp;
    }

    /**
     * Converts a hashable property back into a plain {@link StructuredProperty}.
     *
     * @param hsp the hashable property to copy
     * @return a new {@link StructuredProperty} carrying the same value, qualifier and dataInfo
     */
    public static StructuredProperty toStructuredProperty(HashableStructuredProperty hsp) {
        final StructuredProperty sp = new StructuredProperty();
        sp.setValue(hsp.getValue());
        sp.setQualifier(hsp.getQualifier());
        // the qualifier used to be set twice here while dataInfo was silently dropped
        sp.setDataInfo(hsp.getDataInfo());
        return sp;
    }

    /** Null-safe access to the qualifier's classid. */
    private static String classidOf(final Qualifier qualifier) {
        return qualifier == null ? null : qualifier.getClassid();
    }

    /** Null-safe access to the qualifier's schemeid. */
    private static String schemeidOf(final Qualifier qualifier) {
        return qualifier == null ? null : qualifier.getSchemeid();
    }

    /**
     * Hash based on the qualifier's classid and schemeid and on the value; a missing qualifier
     * contributes {@code null} components instead of raising a NullPointerException.
     */
    @Override
    public int hashCode() {
        return new HashCodeBuilder(11, 91)
            .append(classidOf(getQualifier()))
            .append(schemeidOf(getQualifier()))
            .append(getValue())
            .hashCode();
    }

    /**
     * Two instances are equal when they are of the same class and share the qualifier's classid,
     * the qualifier's schemeid and the value. DataInfo is not part of the comparison.
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == null) {
            return false;
        }
        if (obj == this) {
            return true;
        }
        if (obj.getClass() != getClass()) {
            return false;
        }
        final HashableStructuredProperty rhs = (HashableStructuredProperty) obj;
        return new EqualsBuilder()
            .append(classidOf(getQualifier()), classidOf(rhs.getQualifier()))
            .append(schemeidOf(getQualifier()), schemeidOf(rhs.getQualifier()))
            .append(getValue(), rhs.getValue())
            .isEquals();
    }
}

View File

@ -43,4 +43,34 @@ public class CleaningFunctions {
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue); return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
} }
/**
 * Normalises the value of the given PID according to the PID type found in its qualifier.
 * The PID is updated in place and the very same instance is handed back to the caller.
 *
 * @param pid the PID whose value will be normalised.
 * @return the same PID instance, now holding the normalised value.
 */
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
    final String pidType = pid.getQualifier().getClassid();
    final String normalizedValue = normalizePidValue(pidType, pid.getValue());
    pid.setValue(normalizedValue);
    return pid;
}
/**
 * Normalises a PID value according to its type. The value is always trimmed; for the
 * {@code "doi"} type it is additionally lower-cased (locale-independently) and its first match
 * of {@code DOI_PREFIX_REGEX} is replaced with {@code DOI_PREFIX}. Any other type, including a
 * {@code null} type, yields the trimmed value unchanged.
 *
 * @param pidType the PID type, compared case-sensitively; may be {@code null}
 * @param pidValue the PID value to normalise
 * @return the normalised PID value
 * @throws IllegalArgumentException when {@code pidValue} is {@code null}
 */
public static String normalizePidValue(String pidType, String pidValue) {
    final String value = Optional
        .ofNullable(pidValue)
        .map(String::trim)
        .orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));

    // switching on a null String raises a NullPointerException: treat it as an unknown type
    if (pidType == null) {
        return value;
    }

    switch (pidType) {
        // TODO add cleaning for more PID types as needed
        case "doi":
            // Locale.ROOT keeps the lower-casing stable whatever the JVM default locale is
            return value.toLowerCase(java.util.Locale.ROOT).replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX);
        default:
            return value;
    }
}
} }

View File

@ -6,11 +6,18 @@ import org.apache.commons.lang3.StringUtils;
public class DoiCleaningRule { public class DoiCleaningRule {
public static String clean(final String doi) { public static String clean(final String doi) {
if (doi == null) return doi
return null; .toLowerCase()
final String replaced = doi .replaceAll("\\s", "")
.replaceAll("\\n|\\r|\\t|\\s", "")
.replaceAll("^doi:", "") .replaceAll("^doi:", "")
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
}
public static String normalizeDoi(final String input) {
if (input == null)
return null;
final String replaced = input
.replaceAll("\\n|\\r|\\t|\\s", "")
.toLowerCase() .toLowerCase()
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX); .replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
if (StringUtils.isEmpty(replaced)) if (StringUtils.isEmpty(replaced))
@ -25,6 +32,7 @@ public class DoiCleaningRule {
return null; return null;
return ret; return ret;
} }
} }

View File

@ -563,24 +563,12 @@ public class GraphCleaningFunctions extends CleaningFunctions {
Optional Optional
.ofNullable(i.getPid()) .ofNullable(i.getPid())
.ifPresent(pid -> { .ifPresent(pid -> {
final Set<HashableStructuredProperty> pids = pid final Set<StructuredProperty> pids = Sets.newHashSet(pid);
.stream()
.map(HashableStructuredProperty::newInstance)
.collect(Collectors.toCollection(HashSet::new));
Optional Optional
.ofNullable(i.getAlternateIdentifier()) .ofNullable(i.getAlternateIdentifier())
.ifPresent(altId -> { .ifPresent(altId -> {
final Set<HashableStructuredProperty> altIds = altId final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
.stream() i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
.map(HashableStructuredProperty::newInstance)
.collect(Collectors.toCollection(HashSet::new));
i
.setAlternateIdentifier(
Sets
.difference(altIds, pids)
.stream()
.map(HashableStructuredProperty::toStructuredProperty)
.collect(Collectors.toList()));
}); });
}); });

View File

@ -175,7 +175,7 @@ public class IdentifierFactory implements Serializable {
return entity return entity
.getPid() .getPid()
.stream() .stream()
.map(PidCleaner::normalizePidValue) .map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter) .filter(CleaningFunctions::pidFilter)
.collect( .collect(
Collectors Collectors
@ -207,7 +207,7 @@ public class IdentifierFactory implements Serializable {
// filter away PIDs provided by a DS that is not considered an authority for the // filter away PIDs provided by a DS that is not considered an authority for the
// given PID Type // given PID Type
.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles)) .filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
.map(PidCleaner::normalizePidValue) .map(CleaningFunctions::normalizePidValue)
.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p)) .filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
.filter(CleaningFunctions::pidFilter)) .filter(CleaningFunctions::pidFilter))
.orElse(Stream.empty()); .orElse(Stream.empty());

View File

@ -96,7 +96,7 @@ public class MergeEntitiesComparator implements Comparator<Oaf> {
// id // id
if (res == 0) { if (res == 0) {
if (left instanceof OafEntity && right instanceof OafEntity) { if (left instanceof OafEntity && right instanceof OafEntity) {
res = ((OafEntity) right).getId().compareTo(((OafEntity) left).getId()); res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
} }
} }

View File

@ -468,10 +468,6 @@ public class MergeUtils {
merge.setIsInDiamondJournal(booleanOR(merge.getIsInDiamondJournal(), enrich.getIsInDiamondJournal())); merge.setIsInDiamondJournal(booleanOR(merge.getIsInDiamondJournal(), enrich.getIsInDiamondJournal()));
merge.setPubliclyFunded(booleanOR(merge.getPubliclyFunded(), enrich.getPubliclyFunded())); merge.setPubliclyFunded(booleanOR(merge.getPubliclyFunded(), enrich.getPubliclyFunded()));
if (StringUtils.isBlank(merge.getTransformativeAgreement())) {
merge.setTransformativeAgreement(enrich.getTransformativeAgreement());
}
return merge; return merge;
} }
@ -975,7 +971,7 @@ public class MergeUtils {
private static String extractKeyFromPid(final StructuredProperty pid) { private static String extractKeyFromPid(final StructuredProperty pid) {
if (pid == null) if (pid == null)
return null; return null;
final StructuredProperty normalizedPid = PidCleaner.normalizePidValue(pid); final StructuredProperty normalizedPid = CleaningFunctions.normalizePidValue(pid);
return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue()); return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
} }

View File

@ -18,8 +18,8 @@ public class PidValueComparator implements Comparator<StructuredProperty> {
if (right == null) if (right == null)
return -1; return -1;
StructuredProperty l = PidCleaner.normalizePidValue(left); StructuredProperty l = CleaningFunctions.normalizePidValue(left);
StructuredProperty r = PidCleaner.normalizePidValue(right); StructuredProperty r = CleaningFunctions.normalizePidValue(right);
return Optional return Optional
.ofNullable(l.getValue()) .ofNullable(l.getValue())

View File

@ -28,7 +28,6 @@ import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo; import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import net.minidev.json.JSONArray; import net.minidev.json.JSONArray;
import scala.collection.JavaConverters; import scala.collection.JavaConverters;
import scala.collection.Seq; import scala.collection.Seq;
@ -105,7 +104,7 @@ public class DHPUtils {
public static String generateUnresolvedIdentifier(final String pid, final String pidType) { public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
final String cleanedPid = PidCleaner.normalizePidValue(pidType, pid); final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim()); return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
} }

View File

@ -154,13 +154,5 @@
"unknown":{ "unknown":{
"original":"Unknown", "original":"Unknown",
"inverse":"Unknown" "inverse":"Unknown"
},
"isamongtopnsimilardocuments": {
"original": "IsAmongTopNSimilarDocuments",
"inverse": "HasAmongTopNSimilarDocuments"
},
"hasamongtopnsimilardocuments": {
"original": "HasAmongTopNSimilarDocuments",
"inverse": "IsAmongTopNSimilarDocuments"
} }
} }

View File

@ -65,13 +65,12 @@ abstract class AbstractScalaApplication(
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val master = parser.get("master") val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master") log.info(s"Creating Spark session: Master: $master")
val b = SparkSession SparkSession
.builder() .builder()
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
if (master != null) .master(master)
b.master(master) .getOrCreate()
b.getOrCreate()
} }
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = { def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {

View File

@ -65,11 +65,7 @@ object ScholixUtils extends Serializable {
} }
def generateScholixResourceFromResult(r: Result): ScholixResource = { def generateScholixResourceFromResult(r: Result): ScholixResource = {
val sum = ScholixUtils.resultToSummary(r) generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
if (sum != null)
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
else
null
} }
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@ -157,14 +153,6 @@ object ScholixUtils extends Serializable {
} }
/** Returns the inverse of the given relation name, looked up by its lower-cased form in
  * `relations`; yields null when the relation is not known.
  */
def invRel(rel: String): String =
  relations.getOrElse(rel.toLowerCase, null) match {
    case null     => null
    case semantic => semantic.inverse
  }
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = { def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) { if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d => val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@ -389,7 +377,10 @@ object ScholixUtils extends Serializable {
if (persistentIdentifiers.isEmpty) if (persistentIdentifiers.isEmpty)
return null return null
s.setLocalIdentifier(persistentIdentifiers.asJava) s.setLocalIdentifier(persistentIdentifiers.asJava)
// s.setTypology(r.getResulttype.getClassid) if (r.isInstanceOf[Publication])
s.setTypology(Typology.publication)
else
s.setTypology(Typology.dataset)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)

View File

@ -29,7 +29,7 @@ class IdentifierFactoryTest {
"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true); "publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
verifyIdentifier( verifyIdentifier(
"publication_doi3.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true); "publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
verifyIdentifier( verifyIdentifier(
"publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true); "publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true);
@ -41,7 +41,7 @@ class IdentifierFactoryTest {
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true); "publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
verifyIdentifier( verifyIdentifier(
"publication_pmc2.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true); "publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
verifyIdentifier( verifyIdentifier(
"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true); "publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);

View File

@ -177,7 +177,7 @@ class OafMapperUtilsTest {
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals( assertEquals(
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, ModelConstants.DATASET_RESULTTYPE_CLASSID,
((Result) MergeUtils ((Result) MergeUtils
.merge(p2, d1)) .merge(p2, d1))
.getResulttype() .getResulttype()

View File

@ -29,7 +29,7 @@
}, },
{ {
"qualifier": {"classid": "pmc"}, "qualifier": {"classid": "pmc"},
"value": "PMC21459329" "value": "21459329"
} }
] ]
} }

View File

@ -13,7 +13,7 @@
}, },
{ {
"qualifier":{"classid":"pmc"}, "qualifier":{"classid":"pmc"},
"value":"PMC21459329" "value":"21459329"
} }
] ]
} }

View File

@ -24,7 +24,7 @@
<executions> <executions>
<execution> <execution>
<id>scala-compile-first</id> <id>scala-compile-first</id>
<phase>process-resources</phase> <phase>initialize</phase>
<goals> <goals>
<goal>add-source</goal> <goal>add-source</goal>
<goal>compile</goal> <goal>compile</goal>
@ -59,6 +59,14 @@
<groupId>edu.cmu</groupId> <groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId> <artifactId>secondstring</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
@ -83,6 +91,10 @@
<groupId>com.fasterxml.jackson.core</groupId> <groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId> <artifactId>jackson-databind</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</dependency>
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>
@ -101,90 +113,4 @@
</dependency> </dependency>
</dependencies> </dependencies>
<profiles>
<profile>
<id>spark-24</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-34</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-2</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>spark-35</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<version>3.4.0</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>src/main/spark-35</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</project> </project>

View File

@ -2,41 +2,31 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.*; import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
@ClusteringClass("legalnameclustering") @ClusteringClass("keywordsclustering")
public class LegalnameClustering extends AbstractClusteringFunction { public class KeywordsClustering extends AbstractClusteringFunction {
private static final Pattern CITY_CODE_PATTERN = Pattern.compile("city::\\d+"); public KeywordsClustering(Map<String, Object> params) {
private static final Pattern KEYWORD_CODE_PATTERN = Pattern.compile("key::\\d+");
public LegalnameClustering(Map<String, Object> params) {
super(params); super(params);
} }
public Set<String> getRegexList(String input, Pattern codeRegex) {
Matcher matcher = codeRegex.matcher(input);
Set<String> cities = new HashSet<>();
while (matcher.find()) {
cities.add(matcher.group());
}
return cities;
}
@Override @Override
protected Collection<String> doApply(final Config conf, String s) { protected Collection<String> doApply(final Config conf, String s) {
// takes city codes and keywords codes without duplicates
Set<String> keywords = getKeywords(s, conf.translationMap(), paramOrDefault("windowSize", 4));
Set<String> cities = getCities(s, paramOrDefault("windowSize", 4));
// list of combination to return as result // list of combination to return as result
final Collection<String> combinations = new LinkedHashSet<String>(); final Collection<String> combinations = new LinkedHashSet<String>();
for (String keyword : getRegexList(s, KEYWORD_CODE_PATTERN)) { for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
for (String city : getRegexList(s, CITY_CODE_PATTERN)) { for (String city : citiesToCodes(cities)) {
combinations.add(keyword + "-" + city); combinations.add(keyword + "-" + city);
if (combinations.size() >= paramOrDefault("max", 2)) { if (combinations.size() >= paramOrDefault("max", 2)) {
return combinations; return combinations;
@ -52,6 +42,9 @@ public class LegalnameClustering extends AbstractClusteringFunction {
return fields return fields
.stream() .stream()
.filter(f -> !f.isEmpty()) .filter(f -> !f.isEmpty())
.map(KeywordsClustering::cleanup)
.map(KeywordsClustering::normalize)
.map(s -> filterAllStopWords(s))
.map(s -> doApply(conf, s)) .map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist)) .map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream()) .flatMap(c -> c.stream())

View File

@ -27,14 +27,6 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
private static Map<String, String> cityMap = AbstractPaceFunctions private static Map<String, String> cityMap = AbstractPaceFunctions
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); .loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
// keywords map to be used when translating the keyword names into codes
private static Map<String, String> keywordMap = AbstractPaceFunctions
.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
// country map to be used when inferring the country from the city name
private static Map<String, String> countryMap = AbstractPaceFunctions
.loadCountryMapFromClasspath("/eu/dnetlib/pace/config/country_map.csv");
// list of stopwords in different languages // list of stopwords in different languages
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt"); protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
@ -82,64 +74,6 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return s12; return s12;
} }
/**
 * Infers a country from the cities mentioned in a text, but only when the current country
 * is "unknown" (case-insensitive); any other value is returned untouched.
 *
 * @param original the current country value
 * @param inferFrom the text in which cities are searched (window size 4)
 * @return {@code original} when it is not "unknown", otherwise the first country obtained
 *         from the detected cities, or "UNKNOWN" when none is found
 */
public static String countryInference(final String original, String inferFrom) {
    if (original.equalsIgnoreCase("unknown")) {
        final String prepared = filterAllStopWords(normalize(cleanup(inferFrom)));
        return citiesToCountry(getCities(prepared, 4))
            .stream()
            .findFirst()
            .orElse("UNKNOWN");
    }
    return original;
}
/**
 * Cleans, normalises and strips stop words from the input, then replaces every detected city
 * (window size 4) with the code registered for it in {@code cityMap}.
 *
 * @param original the text to process
 * @return the processed text with city names substituted by their codes
 */
public static String cityInference(String original) {
    original = cleanup(original);
    original = normalize(original);
    original = filterAllStopWords(original);

    final Set<String> cities = getCities(original, 4);

    for (String city : cities) {
        // literal substitution: replaceAll would interpret the city as a regular expression
        // and the code as a replacement pattern
        original = original.replace(city, cityMap.get(city));
    }

    return original;
}
/**
 * Cleans, normalises and strips stop words from the input, then replaces every detected keyword
 * (window size 4) with the code registered for it in {@code keywordMap}.
 *
 * @param original the text to process
 * @return the processed text with keywords substituted by their codes
 */
public static String keywordInference(String original) {
    original = cleanup(original);
    original = normalize(original);
    original = filterAllStopWords(original);

    final Set<String> keywords = getKeywords(original, keywordMap, 4);

    for (String keyword : keywords) {
        // literal substitution: replaceAll would interpret the keyword as a regular expression
        // and the code as a replacement pattern
        original = original.replace(keyword, keywordMap.get(keyword));
    }

    return original;
}
/**
 * Cleans, normalises and strips stop words from the input, then replaces the detected keywords
 * first and the detected cities afterwards (both with window size 4) with the codes registered
 * in {@code keywordMap} and {@code cityMap} respectively. Both sets are detected before any
 * substitution takes place.
 *
 * @param original the text to process
 * @return the processed text with keywords and city names substituted by their codes
 */
public static String cityKeywordInference(String original) {
    original = cleanup(original);
    original = normalize(original);
    original = filterAllStopWords(original);

    final Set<String> keywords = getKeywords(original, keywordMap, 4);
    final Set<String> cities = getCities(original, 4);

    // literal substitutions: replaceAll would interpret the matched text as a regular
    // expression and the code as a replacement pattern
    for (String keyword : keywords) {
        original = original.replace(keyword, keywordMap.get(keyword));
    }

    for (String city : cities) {
        original = original.replace(city, cityMap.get(city));
    }

    return original;
}
protected static String fixXML(final String a) { protected static String fixXML(final String a) {
return a return a
@ -274,30 +208,6 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return m; return m;
} }
/**
 * Loads a city-code to country-code map from a classpath resource whose lines have the form
 * {@code country_code;city1;city2;...}. Each city is lower-cased, transliterated with the
 * "Any-Eng" transliterator and passed through {@code fixAliases}, then translated into its code
 * through {@code cityMap}; that code becomes the key and the country code the value.
 *
 * @param classpath the classpath location of the resource
 * @return the loaded map, or an empty map when anything goes wrong while reading or parsing
 */
public static Map<String, String> loadCountryMapFromClasspath(final String classpath) {

    final Transliterator transliterator = Transliterator.getInstance("Any-Eng");

    final Map<String, String> m = new HashMap<>();

    // try-with-resources: the stream obtained from the classpath used to be left open
    try (java.io.InputStream in = AbstractPaceFunctions.class.getResourceAsStream(classpath)) {
        for (final String s : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
            // string is like this: country_code;city1;city2;city3
            final String[] line = s.split(";");
            final String value = line[0];
            for (int i = 1; i < line.length; i++) {
                final String city = fixAliases(transliterator.transliterate(line[i].toLowerCase()));
                final String code = cityMap.get(city);
                m.put(code, value);
            }
        }
    } catch (final Throwable e) {
        // NOTE(review): the failure is swallowed and partial results are discarded; this looks
        // like a best-effort load, so the behaviour is kept - confirm it is intended
        return new HashMap<>();
    }
    return m;
}
public static String removeKeywords(String s, Set<String> keywords) { public static String removeKeywords(String s, Set<String> keywords) {
s = " " + s + " "; s = " " + s + " ";
@ -327,10 +237,6 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return toCodes(keywords, cityMap); return toCodes(keywords, cityMap);
} }
/**
 * Translates city names into country codes in two steps: names to city codes through
 * {@code cityMap}, then city codes to countries through {@code countryMap}.
 *
 * @param cities the city names to translate
 * @return the result of the second translation step
 */
public static Set<String> citiesToCountry(Set<String> cities) {
    final Set<String> cityCodes = toCodes(cities, cityMap);
    return toCodes(cityCodes, countryMap);
}
protected static String firstLC(final String s) { protected static String firstLC(final String s) {
return StringUtils.substring(s, 0, 1).toLowerCase(); return StringUtils.substring(s, 0, 1).toLowerCase();
} }

View File

@ -47,21 +47,9 @@ public class FieldDef implements Serializable {
private String clean; private String clean;
private String infer;
private String inferenceFrom;
public FieldDef() { public FieldDef() {
} }
public String getInferenceFrom() {
return inferenceFrom;
}
public void setInferenceFrom(final String inferenceFrom) {
this.inferenceFrom = inferenceFrom;
}
public String getName() { public String getName() {
return name; return name;
} }
@ -138,14 +126,6 @@ public class FieldDef implements Serializable {
this.clean = clean; this.clean = clean;
} }
public String getInfer() {
return infer;
}
public void setInfer(String infer) {
this.infer = infer;
}
@Override @Override
public String toString() { public String toString() {
try { try {

View File

@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
import com.jayway.jsonpath.{Configuration, JsonPath} import com.jayway.jsonpath.{Configuration, JsonPath}
import eu.dnetlib.pace.common.AbstractPaceFunctions import eu.dnetlib.pace.common.AbstractPaceFunctions
import eu.dnetlib.pace.config.{DedupConfig, Type} import eu.dnetlib.pace.config.{DedupConfig, Type}
import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils} import eu.dnetlib.pace.util.MapDocumentUtil
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName) val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => { val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema)) df.map(r => rowFromJson(r))(RowEncoder(schema))
} }
def rowFromJson(json: String): Row = { def rowFromJson(json: String): Row = {
@ -123,19 +123,9 @@ case class SparkModel(conf: DedupConfig) {
case _ => res(index) case _ => res(index)
} }
} }
if (StringUtils.isNotBlank(fdef.getInfer)) {
val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
res(index) = res(index) match {
case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
}
}
} }
res res
} }
new GenericRowWithSchema(values, schema) new GenericRowWithSchema(values, schema)
@ -156,17 +146,5 @@ case class SparkModel(conf: DedupConfig) {
res res
} }
/** Dispatches to the inference routine selected by `infertype` ("country", "city", "keyword"
  * or "city_keyword"); any other type leaves the value untouched. Only the "country"
  * inference makes use of `inferfrom`.
  */
def inference(value: String, inferfrom: String, infertype: String): String =
  infertype match {
    case "country"      => AbstractPaceFunctions.countryInference(value, inferfrom)
    case "city"         => AbstractPaceFunctions.cityInference(value)
    case "keyword"      => AbstractPaceFunctions.keywordInference(value)
    case "city_keyword" => AbstractPaceFunctions.cityKeywordInference(value)
    case _              => value
  }
} }

View File

@ -0,0 +1,48 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator registered as "cityMatch": compares two strings through the codes of the cities
 * detected in them.
 */
@ComparatorClass("cityMatch")
public class CityMatch extends AbstractStringComparator {

    private Map<String, String> params;

    public CityMatch(Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * Both inputs are cleaned, normalised and stripped of stop words before cities are searched
     * using the "windowSize" parameter (default 4).
     *
     * @return 1.0 when neither input contains a city, -1 when exactly one of them does,
     *         otherwise the percentage of city codes the two inputs have in common
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {

        String left = cleanup(a);
        String right = cleanup(b);

        left = normalize(left);
        right = normalize(right);

        left = filterAllStopWords(left);
        right = filterAllStopWords(right);

        final Set<String> leftCodes = citiesToCodes(
            getCities(left, Integer.parseInt(params.getOrDefault("windowSize", "4"))));
        final Set<String> rightCodes = citiesToCodes(
            getCities(right, Integer.parseInt(params.getOrDefault("windowSize", "4"))));

        final boolean leftEmpty = leftCodes.isEmpty();
        final boolean rightEmpty = rightCodes.isEmpty();

        // if no cities are detected, the comparator gives 1.0
        if (leftEmpty && rightEmpty) {
            return 1.0;
        }
        // undefined if one of the two has no cities
        if (leftEmpty != rightEmpty) {
            return -1;
        }
        return commonElementsPercentage(leftCodes, rightCodes);
    }
}

View File

@ -1,51 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator registered as "codeMatch": compares two strings through the codes extracted from
 * them with a regular expression, configurable via the "codeRegex" parameter.
 */
@ComparatorClass("codeMatch")
public class CodeMatch extends AbstractStringComparator {

    private Map<String, String> params;

    private Pattern CODE_REGEX;

    public CodeMatch(Map<String, String> params) {
        super(params);
        this.params = params;
        final String regex = params.getOrDefault("codeRegex", "[a-zA-Z]::\\d+");
        this.CODE_REGEX = Pattern.compile(regex);
    }

    /**
     * Collects every distinct match of the configured pattern found in the input.
     *
     * @param input the text to scan
     * @return the set of matched codes, empty when there is no match
     */
    public Set<String> getRegexList(String input) {
        final Set<String> codes = new HashSet<>();
        for (final Matcher m = this.CODE_REGEX.matcher(input); m.find();) {
            codes.add(m.group());
        }
        return codes;
    }

    /**
     * @return 1.0 when neither input contains a code, -1 when exactly one of them does,
     *         otherwise the percentage of codes the two inputs have in common
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {

        final Set<String> leftCodes = getRegexList(a);
        final Set<String> rightCodes = getRegexList(b);

        final boolean leftEmpty = leftCodes.isEmpty();
        final boolean rightEmpty = rightCodes.isEmpty();

        // if no codes are detected, the comparator gives 1.0
        if (leftEmpty && rightEmpty) {
            return 1.0;
        }
        // undefined if one of the two has no codes
        if (leftEmpty != rightEmpty) {
            return -1;
        }
        return commonElementsPercentage(leftCodes, rightCodes);
    }
}

View File

@ -2,7 +2,6 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map; import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
@ -13,11 +12,8 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("countryMatch") @ComparatorClass("countryMatch")
public class CountryMatch extends AbstractStringComparator { public class CountryMatch extends AbstractStringComparator {
private Map<String, String> params;
public CountryMatch(Map<String, String> params) { public CountryMatch(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler()); super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
} }
public CountryMatch(final double weight) { public CountryMatch(final double weight) {
@ -30,7 +26,6 @@ public class CountryMatch extends AbstractStringComparator {
@Override @Override
public double distance(final String a, final String b, final Config conf) { public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) { if (a.isEmpty() || b.isEmpty()) {
return -1.0; // return -1 if a field is missing return -1.0; // return -1 if a field is missing
} }
@ -50,5 +45,4 @@ public class CountryMatch extends AbstractStringComparator {
protected double normalize(final double d) { protected double normalize(final double d) {
return d; return d;
} }
} }

View File

@ -1,59 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator registered under the name {@code jaroWinklerLegalname}.
 *
 * <p>Substrings matching {@code city::<digits>} and {@code key::<digits>} are taken out of both
 * inputs before they are scored with the configured string-distance algorithm.
 */
@ComparatorClass("jaroWinklerLegalname")
public class JaroWinklerLegalname extends AbstractStringComparator {

    private Map<String, String> params;

    private final String CITY_CODE_REGEX = "city::\\d+";
    private final String KEYWORD_CODE_REGEX = "key::\\d+";

    public JaroWinklerLegalname(Map<String, String> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
        this.params = params;
    }

    public JaroWinklerLegalname(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected JaroWinklerLegalname(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Scores the two strings after their city and keyword codes have been removed.
     *
     * @param a first string
     * @param b second string
     * @param conf comparison configuration (not read by this comparator)
     * @return {@code 1.0} when both stripped strings are empty, otherwise the score produced by
     *         {@code ssalgo}
     */
    @Override
    public double distance(String a, String b, final Config conf) {
        final String left = withoutCodes(a);
        final String right = withoutCodes(b);

        if (left.isEmpty() && right.isEmpty()) {
            return 1.0;
        }
        return normalize(ssalgo.score(left, right));
    }

    /**
     * Deletes city codes, replaces keyword codes with a single blank and collapses every run of
     * two or more blanks into one.
     */
    private String withoutCodes(final String text) {
        final String noCities = text.replaceAll(CITY_CODE_REGEX, "");
        final String noKeywords = noCities.replaceAll(KEYWORD_CODE_REGEX, " ");
        return noKeywords.replaceAll("[ ]{2,}", " ");
    }

    /** Returns the weight held by the parent comparator. */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Identity: the score is passed through unchanged. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -0,0 +1,74 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator registered under the name {@code jaroWinklerNormalizedName}.
 *
 * <p>Both inputs are cleaned, normalized and stripped of stop words; the keywords and cities
 * detected in them are then removed and what is left is scored with the configured
 * string-distance algorithm.
 */
@ComparatorClass("jaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends AbstractStringComparator {

    /** Window size used when no {@code windowSize} parameter is available. */
    private static final String DEFAULT_WINDOW_SIZE = "4";

    // Stays null when the comparator is built through one of the weight-based constructors.
    private Map<String, String> params;

    public JaroWinklerNormalizedName(Map<String, String> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
        this.params = params;
    }

    public JaroWinklerNormalizedName(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Scores the two strings after removing the keywords and cities detected in them.
     *
     * @param a first string
     * @param b second string
     * @param conf comparison configuration; its translation map is used for keyword detection
     * @return {@code 1.0} when nothing is left of either string, otherwise the score produced
     *         by {@code ssalgo}
     */
    @Override
    public double distance(String a, String b, final Config conf) {
        // parsed once instead of once per getKeywords/getCities call
        final int windowSize = resolveWindowSize();

        String ca = cleanup(a);
        String cb = cleanup(b);

        ca = normalize(ca);
        cb = normalize(cb);

        ca = filterAllStopWords(ca);
        cb = filterAllStopWords(cb);

        final Set<String> keywords1 = getKeywords(ca, conf.translationMap(), windowSize);
        final Set<String> keywords2 = getKeywords(cb, conf.translationMap(), windowSize);

        final Set<String> cities1 = getCities(ca, windowSize);
        final Set<String> cities2 = getCities(cb, windowSize);

        ca = removeKeywords(ca, keywords1);
        ca = removeKeywords(ca, cities1);
        cb = removeKeywords(cb, keywords2);
        cb = removeKeywords(cb, cities2);

        // removals can leave runs of blanks behind: collapse them
        ca = ca.replaceAll("[ ]{2,}", " ");
        cb = cb.replaceAll("[ ]{2,}", " ");

        if (ca.isEmpty() && cb.isEmpty()) {
            return 1.0;
        }
        return normalize(ssalgo.score(ca, cb));
    }

    /**
     * Reads the {@code windowSize} parameter. Falls back to the default both when the parameter
     * is missing and when no parameter map was supplied at construction time, which previously
     * caused a {@link NullPointerException}.
     */
    private int resolveWindowSize() {
        if (params == null) {
            return Integer.parseInt(DEFAULT_WINDOW_SIZE);
        }
        return Integer.parseInt(params.getOrDefault("windowSize", DEFAULT_WINDOW_SIZE));
    }

    /** Returns the weight held by the parent comparator. */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Identity: the score is passed through unchanged. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -0,0 +1,50 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator registered under the name {@code keywordMatch}.
 *
 * <p>Each input is cleaned, normalized and stripped of stop words; the keywords found in it are
 * then turned into codes through the configuration's translation map and the two code sets are
 * compared.
 */
@ComparatorClass("keywordMatch")
public class KeywordMatch extends AbstractStringComparator {

    Map<String, String> params;

    public KeywordMatch(Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * Compares the keyword codes detected in the two strings.
     *
     * @param a first string
     * @param b second string
     * @param conf comparison configuration; supplies the translation map
     * @return {@code 1.0} when neither string yields a keyword code, {@code -1.0} when exactly
     *         one of them does, otherwise the result of {@code commonElementsPercentage} on the
     *         two code sets
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        final Set<String> firstCodes = extractKeywordCodes(a, conf);
        final Set<String> secondCodes = extractKeywordCodes(b, conf);

        final boolean firstEmpty = firstCodes.isEmpty();
        final boolean secondEmpty = secondCodes.isEmpty();

        // no keyword on either side: the comparator gives 1.0
        if (firstEmpty && secondEmpty) {
            return 1.0;
        }
        // a keyword on one side only: the comparison is undefined
        if (firstEmpty || secondEmpty) {
            return -1.0;
        }
        return commonElementsPercentage(firstCodes, secondCodes);
    }

    /** Runs the cleanup / normalize / stop-word pipeline on {@code raw} and returns its keyword codes. */
    private Set<String> extractKeywordCodes(final String raw, final Config conf) {
        final String text = filterAllStopWords(normalize(cleanup(raw)));
        final Set<String> keywords = getKeywords(text, conf.translationMap(), configuredWindowSize());
        return toCodes(keywords, conf.translationMap());
    }

    /** Reads the {@code windowSize} parameter, falling back to {@code 4} when it is not set. */
    private int configuredWindowSize() {
        return Integer.parseInt(params.getOrDefault("windowSize", "4"));
    }
}

File diff suppressed because one or more lines are too long

View File

@ -1,12 +0,0 @@
package eu.dnetlib.pace.util
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.StructType
object SparkCompatUtils {

  /** Returns the `Row` encoder for `schema`, obtained by calling `RowEncoder(schema)`. */
  def encoderFor(schema: StructType): ExpressionEncoder[Row] =
    RowEncoder(schema)
}

View File

@ -1,12 +0,0 @@
package eu.dnetlib.pace.util
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.StructType
object SparkCompatUtils {

  /** Returns the `Row` encoder for `schema`, obtained by calling `ExpressionEncoder(schema)`. */
  def encoderFor(schema: StructType): ExpressionEncoder[Row] =
    ExpressionEncoder(schema)
}

View File

@ -8,7 +8,6 @@ import org.junit.jupiter.api.Test;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.mongodb.connection.Cluster;
import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
@ -178,16 +177,41 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
} }
@Test @Test
public void legalnameClustering() { public void testKeywordsClustering() {
final ClusteringFunction cf = new LegalnameClustering(params); final ClusteringFunction cf = new KeywordsClustering(params);
String s = "key::1 key::2 city::1"; final String s = "Polytechnic University of Turin";
System.out.println(s); System.out.println(s);
System.out.println(cf.apply(conf, Lists.newArrayList(s))); System.out.println(cf.apply(conf, Lists.newArrayList(s)));
s = "key::1 key::2 city::1 city::2"; final String s1 = "POLITECNICO DI TORINO";
System.out.println(s); System.out.println(s1);
System.out.println(cf.apply(conf, Lists.newArrayList(s))); System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
final String s2 = "Universita farmaceutica culturale di milano bergamo";
System.out.println("s2 = " + s2);
System.out.println(cf.apply(conf, Lists.newArrayList(s2)));
final String s3 = "universita universita milano milano";
System.out.println("s3 = " + s3);
System.out.println(cf.apply(conf, Lists.newArrayList(s3)));
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
System.out.println("s4 = " + s4);
System.out.println(cf.apply(conf, Lists.newArrayList(s4)));
final String s5 = "İstanbul Ticarət Universiteti";
System.out.println("s5 = " + s5);
System.out.println(cf.apply(conf, Lists.newArrayList(s5)));
final String s6 = "National and Kapodistrian University of Athens";
System.out.println("s6 = " + s6);
System.out.println(cf.apply(conf, Lists.newArrayList(s6)));
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
System.out.println("s7 = " + s7);
System.out.println(cf.apply(conf, Lists.newArrayList(s7)));
} }
@Test @Test

View File

@ -54,47 +54,4 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
} }
/**
 * Checks the country returned by {@code countryInference} for a current country value and an
 * organization name.
 */
@Test
public void countryInferenceTest() {
    final String unknown = "UNKNOWN";
    final String bologna = "Università di Bologna";

    // unknown country + name containing Bologna: "IT" is expected
    assertEquals("IT", countryInference(unknown, bologna));

    // a country other than UNKNOWN is expected to be returned as it is
    assertEquals("UK", countryInference("UK", bologna));

    final String naples = "Universiteé de Naples";
    assertEquals("IT", countryInference(unknown, naples));

    // this name is expected to leave the country UNKNOWN
    final String lavoro = "Università del Lavoro";
    assertEquals(unknown, countryInference(unknown, lavoro));
}
/**
 * Checks the string returned by {@code cityInference} for four organization names, including
 * one with no city code and one with two city codes in the expected output.
 */
@Test
public void cityInferenceTest() {
    final String bologna = "Università di Bologna";
    assertEquals("universita city::3181928", cityInference(bologna));

    final String pisa = "University of Pisa";
    assertEquals("university city::3170647", cityInference(pisa));

    // no city code is expected in the output for this name
    final String lavoro = "Università del lavoro";
    assertEquals("universita", cityInference(lavoro));

    // two city codes are expected for this name
    final String modenaReggio = "Università di Modena e Reggio Emilia";
    assertEquals("universita city::3173331 city::3169522", cityInference(modenaReggio));
}
/**
 * Checks the string returned by {@code keywordInference} for five organization names written
 * in different languages and scripts.
 */
@Test
public void keywordInferenceTest() {
    final String turin = "Polytechnic University of Turin";
    assertEquals("key::41 turin", keywordInference(turin));

    final String torino = "POLITECNICO DI TORINO";
    assertEquals("key::41 torino", keywordInference(torino));

    final String milanoBergamo = "Universita farmaceutica culturale di milano bergamo";
    assertEquals("key::1 key::60 key::81 milano bergamo", keywordInference(milanoBergamo));

    // repeated words are expected to produce repeated codes
    final String repeated = "universita universita milano milano";
    assertEquals("key::1 key::1 milano milano", keywordInference(repeated));

    final String athens = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
    assertEquals("key::10 kapodistriako panepistemio athenon", keywordInference(athens));
}
/**
 * Checks the string returned by {@code cityKeywordInference} for five organization names; the
 * expected outputs contain both {@code key::} and {@code city::} codes.
 */
@Test
public void cityKeywordInferenceTest() {
    // the English and Italian spellings are expected to give the same output
    final String turinCodes = "key::41 city::3165524";
    assertEquals(turinCodes, cityKeywordInference("Polytechnic University of Turin"));
    assertEquals(turinCodes, cityKeywordInference("POLITECNICO DI TORINO"));

    final String milanoBergamo = "Universita farmaceutica culturale di milano bergamo";
    assertEquals("key::1 key::60 key::81 city::3173435 city::3182164", cityKeywordInference(milanoBergamo));

    // repeated words are expected to produce repeated codes
    final String repeated = "universita universita milano milano";
    assertEquals("key::1 key::1 city::3173435 city::3173435", cityKeywordInference(repeated));

    final String athens = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
    assertEquals("key::10 kapodistriako panepistemio city::264371", cityKeywordInference(athens));
}
} }

View File

@ -35,7 +35,6 @@ public class ComparatorTest extends AbstractPaceTest {
params.put("name_th", "0.95"); params.put("name_th", "0.95");
params.put("jpath_value", "$.value"); params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid"); params.put("jpath_classid", "$.qualifier.classid");
params.put("codeRegex", "key::\\d+");
} }
@Test @Test
@ -45,23 +44,52 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test @Test
public void codeMatchTest() { public void cityMatchTest() {
CodeMatch codeMatch = new CodeMatch(params); final CityMatch cityMatch = new CityMatch(params);
// both names with no codes // both names with no cities
assertEquals(1.0, codeMatch.distance("testing1", "testing2", conf)); assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
// one of the two names with no codes // one of the two names with no cities
assertEquals(-1.0, codeMatch.distance("testing1 key::1", "testing", conf)); assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
// both names with codes (same) // both names with cities (same)
assertEquals(1.0, codeMatch.distance("testing1 key::1", "testing2 key::1", conf)); assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
// both names with codes (different) // both names with cities (different)
assertEquals(0.0, codeMatch.distance("testing1 key::1", "testing2 key::2", conf)); assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
// both names with codes (1 same, 1 different) // particular cases
assertEquals(0.5, codeMatch.distance("key::1 key::2 testing1", "key::1 testing", conf)); assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
assertEquals(
1.0,
cityMatch
.distance(
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
conf));
// failing because 'Allen' is a transliterated Greek stopword
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
}
@Test
public void keywordMatchTest() {
params.put("threshold", "0.5");
final KeywordMatch keywordMatch = new KeywordMatch(params);
assertEquals(
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(2.0 / 3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
} }
@ -127,15 +155,15 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test @Test
public void jaroWinklerLegalnameTest() { public void jaroWinklerNormalizedNameTest() {
final JaroWinklerLegalname jaroWinklerLegalname = new JaroWinklerLegalname(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerLegalname double result = jaroWinklerNormalizedName
.distance("AT&T (United States)", "United States key::2 key::1", conf); .distance("AT&T (United States)", "United States Military Academy", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
result = jaroWinklerLegalname.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf); result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
} }
@ -316,13 +344,13 @@ public class ComparatorTest extends AbstractPaceTest {
double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf); double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
assertEquals(-1.0, result); assertEquals(-1.0, result);
result = countryMatch.distance("CL", "UNKNOWN", conf); result = countryMatch.distance("CHILE", "UNKNOWN", conf);
assertEquals(-1.0, result); assertEquals(-1.0, result);
result = countryMatch.distance("CL", "IT", conf); result = countryMatch.distance("CHILE", "ITALY", conf);
assertEquals(0.0, result); assertEquals(0.0, result);
result = countryMatch.distance("CL", "CL", conf); result = countryMatch.distance("CHILE", "CHILE", conf);
assertEquals(1.0, result); assertEquals(1.0, result);
} }

View File

@ -11,7 +11,6 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class UtilTest { public class UtilTest {

View File

@ -1,169 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-shade-package</artifactId>
<packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module creates a jar of all module dependencies</description>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-actionmanager</artifactId>
<version>${project.version}</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-aggregation</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-blacklist</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-broker-events</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<!-- <dependency>-->
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
<!-- <artifactId>dhp-enrichment</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- </dependency>-->
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-mapper</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-provision</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-impact-indicators</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-actionsets</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-hist-snaps</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-monitor-irish</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-promote</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-stats-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-swh</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-raw-data-update</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-usage-stats-build</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
</transformer>
<!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/cxf/bus-extensions.txt</resource>
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/maven/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<relocations>
    <!--
        Relocate Guava under the "repackaged" prefix. The pattern must be the full package
        being moved: with the previous pattern "com", only the leading "com" was replaced by
        the shaded pattern, so com.google.common.X ended up as
        repackaged.com.google.common.google.common.X.
    -->
    <relocation>
        <pattern>com.google.common</pattern>
        <shadedPattern>repackaged.com.google.common</shadedPattern>
        <includes>
            <include>com.google.common.**</include>
        </includes>
    </relocation>
</relocations>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -51,5 +51,48 @@
<artifactId>hadoop-distcp</artifactId> <artifactId>hadoop-distcp</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<exclusions>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon-dom</artifactId>
</exclusion>
<exclusion>
<groupId>jgrapht</groupId>
<artifactId>jgrapht</artifactId>
</exclusion>
<exclusion>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache</artifactId>
</exclusion>
<exclusion>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.*</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>apache</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager;
import java.io.Serializable; import java.io.Serializable;
import java.io.StringReader; import java.io.StringReader;
import java.util.List; import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -21,6 +22,7 @@ import com.google.common.base.Splitter;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.actionmanager.rmi.ActionManagerException;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -63,7 +65,7 @@ public class ISClient implements Serializable {
.map(t -> buildDirectory(basePath, t)) .map(t -> buildDirectory(basePath, t))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElseThrow(() -> new IllegalStateException("empty set list")); .orElseThrow(() -> new IllegalStateException("empty set list"));
} catch (ISLookUpException e) { } catch (ActionManagerException | ISLookUpException e) {
throw new IllegalStateException("unable to query ActionSets info from the IS"); throw new IllegalStateException("unable to query ActionSets info from the IS");
} }
} }
@ -87,18 +89,31 @@ public class ISClient implements Serializable {
return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight()); return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight());
} }
private String getBasePathHDFS(ISLookUpService isLookup) throws ISLookUpException { private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException {
return queryServiceProperty(isLookup, "basePath"); return queryServiceProperty(isLookup, "basePath");
} }
private String queryServiceProperty(ISLookUpService isLookup, final String propertyName) private String queryServiceProperty(ISLookUpService isLookup, final String propertyName)
throws ISLookUpException { throws ActionManagerException {
final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='" final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='"
+ propertyName + propertyName
+ "']/@value/string()"; + "']/@value/string()";
log.debug("quering for service property: {}", q); log.debug("quering for service property: {}", q);
try {
final List<String> value = isLookup.quickSearchProfile(q); final List<String> value = isLookup.quickSearchProfile(q);
return Iterables.getOnlyElement(value); return Iterables.getOnlyElement(value);
} catch (ISLookUpException e) {
String msg = "Error accessing service profile, using query: " + q;
log.error(msg, e);
throw new ActionManagerException(msg, e);
} catch (NoSuchElementException e) {
String msg = "missing service property: " + propertyName;
log.error(msg, e);
throw new ActionManagerException(msg, e);
} catch (IllegalArgumentException e) {
String msg = "found more than one service property: " + propertyName;
log.error(msg, e);
throw new ActionManagerException(msg, e);
}
} }
} }

View File

@ -10,6 +10,7 @@ import java.util.List;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec; import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaPairRDD;
@ -28,7 +29,6 @@ import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.DoiCleaningRule;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2; import scala.Tuple2;
@ -46,8 +46,6 @@ public class PrepareAffiliationRelations implements Serializable {
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation"; public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556"; public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE"; public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
public static final String DOI_URL_PREFIX = "https://doi.org/";
public static final int DOI_URL_PREFIX_LENGTH = 16;
public static <I extends Result> void main(String[] args) throws Exception { public static <I extends Result> void main(String[] args) throws Exception {
@ -100,26 +98,35 @@ public class PrepareAffiliationRelations implements Serializable {
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath, private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath,
String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath, String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath,
String outputPath) { String outputPath) {
List<KeyValue> collectedfromOpenAIRE = OafMapperUtils List<KeyValue> collectedFromCrossref = OafMapperUtils
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); .listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel( spark, crossrefInputPath, collectedFromCrossref);
spark, crossrefInputPath, collectedfromOpenAIRE);
List<KeyValue> collectedFromPubmed = OafMapperUtils
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations( JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
spark, pubmedInputPath, collectedfromOpenAIRE); spark, pubmedInputPath, collectedFromPubmed);
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel( List<KeyValue> collectedFromOpenAPC = OafMapperUtils
spark, openapcInputPath, collectedfromOpenAIRE); .listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
spark, openapcInputPath, collectedFromOpenAPC);
List<KeyValue> collectedFromDatacite = OafMapperUtils
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations( JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
spark, dataciteInputPath, collectedfromOpenAIRE); spark, dataciteInputPath, collectedFromDatacite);
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations( JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
spark, webcrawlInputPath, collectedfromOpenAIRE); spark, webcrawlInputPath, collectedFromWebCrawl);
List<KeyValue> collectedfromPublisher = OafMapperUtils
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher( JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
spark, publisherlInputPath, collectedfromOpenAIRE); spark, publisherlInputPath, collectedfromPublisher);
crossrefRelations crossrefRelations
.union(pubmedRelations) .union(pubmedRelations)
@ -131,21 +138,6 @@ public class PrepareAffiliationRelations implements Serializable {
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
} }
/**
 * Reads publisher-provided affiliation records whose matches are stored in an
 * {@code Organizations} array, keeps the rows having a DOI, exposes the array under the
 * {@code Matchings} name and delegates the conversion to
 * {@code getTextTextJavaPairRDD}.
 *
 * @param spark         the active Spark session
 * @param inputPath     location of the JSON records to read
 * @param collectedfrom provenance datasource(s) forwarded to the conversion step
 * @return the pairs produced by {@code getTextTextJavaPairRDD}
 */
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
    String inputPath,
    List<KeyValue> collectedfrom) {

    final String publisherSchema = "`DOI` STRING, `Organizations` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>";

    final Dataset<Row> recordsWithDoi = spark
        .read()
        .schema(publisherSchema)
        .json(inputPath)
        .where("DOI is not null");

    // the downstream conversion looks the matches up under the "Matchings" column name
    final Dataset<Row> renamed = recordsWithDoi.selectExpr("DOI", "Organizations as Matchings");

    return getTextTextJavaPairRDD(collectedfrom, renamed);
}
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath, private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
List<KeyValue> collectedfrom) { List<KeyValue> collectedfrom) {
@ -173,20 +165,6 @@ public class PrepareAffiliationRelations implements Serializable {
return getTextTextJavaPairRDD(collectedfrom, df); return getTextTextJavaPairRDD(collectedfrom, df);
} }
/**
 * Loads affiliation matches expressed in the "new model" (one {@code Matchings} entry per
 * organization pid, with {@code PID}, {@code Value}, {@code Confidence} and {@code Status})
 * and converts them through {@code getTextTextJavaPairRDDNew}.
 * <p>
 * The method used to declare a type parameter that was never referenced by its signature or
 * body; it has been dropped. Call sites are unaffected.
 *
 * @param spark         the active Spark session
 * @param inputPath     location of the JSON records to read
 * @param collectedfrom provenance datasource(s) forwarded to the conversion step
 * @return the pairs produced by {@code getTextTextJavaPairRDDNew}
 */
private static JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
    String inputPath,
    List<KeyValue> collectedfrom) {

    // load and parse affiliation relations; records without a DOI cannot be linked to a result
    Dataset<Row> df = spark
        .read()
        .schema(
            "`DOI` STRING, `Matchings` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
        .json(inputPath)
        .where("DOI is not null");

    return getTextTextJavaPairRDDNew(collectedfrom, df);
}
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) { private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
// unroll nested arrays // unroll nested arrays
df = df df = df
@ -203,7 +181,7 @@ public class PrepareAffiliationRelations implements Serializable {
// DOI to OpenAIRE id // DOI to OpenAIRE id
final String paperId = ID_PREFIX final String paperId = ID_PREFIX
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi")))); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
// ROR id to OpenAIRE id // ROR id to OpenAIRE id
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid")); final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
@ -235,69 +213,6 @@ public class PrepareAffiliationRelations implements Serializable {
new Text(OBJECT_MAPPER.writeValueAsString(aa)))); new Text(OBJECT_MAPPER.writeValueAsString(aa))));
} }
/**
 * Turns "new model" affiliation matches into serialized atomic actions.
 * <p>
 * The {@code Matchings} array is exploded into one row per match, only matches whose status is
 * {@code active} are kept, and every remaining row is converted into the relations returned by
 * {@code getAffiliationRelationPair}. Each relation is wrapped in an {@link AtomicAction} and
 * emitted as a (payload class name, JSON) pair.
 *
 * @param collectedfrom provenance datasource(s) set on the generated relations
 * @param df            rows exposing the {@code DOI} and {@code Matchings} columns
 * @return one pair per generated relation
 */
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df) {
    // unroll nested arrays: one row per (DOI, matched organization)
    final Dataset<Row> activeMatches = df
        .withColumn("matching", functions.explode(new Column("Matchings")))
        .select(
            new Column("DOI").as("doi"),
            new Column("matching.PID").as("pidtype"),
            new Column("matching.Value").as("pidvalue"),
            new Column("matching.Confidence").as("confidence"),
            new Column("matching.Status").as("status"))
        .where("status = 'active'");

    final FlatMapFunction<Row, Relation> toRelations = row -> {
        // DOI to OpenAIRE id
        final String paperId = ID_PREFIX
            + IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));

        // organization pid to OpenAIRE id: ROR ids are converted, any other pid value is
        // used as it is (OpenOrgs identifier)
        final boolean isRor = row.getAs("pidtype").equals("ROR");
        final String pidValue = row.getAs("pidvalue");
        final String affId = isRor ? GenerateRorActionSetJob.calculateOpenaireId(pidValue) : pidValue;

        final Qualifier provenance = OafMapperUtils
            .qualifier(
                BIP_AFFILIATIONS_CLASSID,
                BIP_AFFILIATIONS_CLASSNAME,
                ModelConstants.DNET_PROVENANCE_ACTIONS,
                ModelConstants.DNET_PROVENANCE_ACTIONS);

        // the match `confidence` becomes the `trust` of the relation
        final DataInfo info = OafMapperUtils
            .dataInfo(
                false,
                BIP_INFERENCE_PROVENANCE,
                true,
                false,
                provenance,
                Double.toString(row.getAs("confidence")));

        // both directions of the relation
        return getAffiliationRelationPair(paperId, affId, collectedfrom, info).iterator();
    };

    // prepare action sets for affiliation relations
    return activeMatches
        .toJavaRDD()
        .flatMap(toRelations)
        .map(p -> new AtomicAction(Relation.class, p))
        .mapToPair(
            aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
                new Text(OBJECT_MAPPER.writeValueAsString(aa))));
}
/**
 * Strips the leading {@code DOI_URL_PREFIX} from a DOI, when present.
 *
 * @param doi the DOI, possibly expressed as a URL; must not be null
 * @return the DOI without the URL prefix, or the input itself when it does not start with it
 */
private static String removePrefix(String doi) {
    return doi.startsWith(DOI_URL_PREFIX)
        ? doi.substring(DOI_URL_PREFIX_LENGTH)
        : doi;
}
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom, private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
DataInfo dataInfo) { DataInfo dataInfo) {
return Arrays return Arrays

View File

@ -10,7 +10,6 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -84,7 +83,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
resultsRDD resultsRDD
.union(projectsRDD) .union(projectsRDD)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}); });
} }

View File

@ -6,23 +6,26 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.Dataset;
import org.jetbrains.annotations.NotNull; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject; import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareSDGSparkJob implements Serializable { public class PrepareSDGSparkJob implements Serializable {
@ -49,91 +52,42 @@ public class PrepareSDGSparkJob implements Serializable {
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
final Boolean distributeDOI = Optional
.ofNullable(parser.get("distributeDoi"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("distribute doi {}", distributeDOI);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
runWithSparkSession( runWithSparkSession(
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (distributeDOI) doPrepare(
doPrepare( spark,
spark, sourcePath,
sourcePath,
outputPath);
else
doPrepareoaid(spark, sourcePath, outputPath);
outputPath);
}); });
} }
private static void doPrepare(SparkSession spark, String sourcePath, String outputPath) { private static void doPrepare(SparkSession spark, String sourcePath, String outputPath) {
Dataset<Row> sdgDataset = spark Dataset<SDGDataModel> sdgDataset = readPath(spark, sourcePath, SDGDataModel.class);
.read()
.format("csv")
.option("sep", DEFAULT_DELIMITER)
.option("inferSchema", "true")
.option("header", "true")
.option("quotes", "\"")
.load(sourcePath);
sdgDataset sdgDataset
.groupByKey((MapFunction<Row, String>) v -> ((String) v.getAs("doi")).toLowerCase(), Encoders.STRING()) .groupByKey((MapFunction<SDGDataModel, String>) r -> r.getDoi().toLowerCase(), Encoders.STRING())
.mapGroups( .mapGroups((MapGroupsFunction<String, SDGDataModel, Result>) (k, it) -> {
(MapGroupsFunction<String, Row, Result>) (k, Result r = new Result();
it) -> getResult( r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
DHPUtils SDGDataModel first = it.next();
.generateUnresolvedIdentifier( List<Subject> sbjs = new ArrayList<>();
ModelSupport.entityIdPrefix.get(Result.class.getSimpleName().toLowerCase()) + "|" + k, sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
DOI), it
it), .forEachRemaining(
Encoders.bean(Result.class)) s -> sbjs
.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
r.setSubject(sbjs);
return r;
}, Encoders.bean(Result.class))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath + "/sdg"); .json(outputPath + "/sdg");
} }
/**
 * Reads the SDG classification CSV, groups its rows by OpenAIRE identifier and writes one
 * {@link Result} per identifier, carrying the SDG subjects of the group, as gzip-compressed
 * JSON under {@code outputPath + "/sdg"}.
 * <p>
 * The grouping key is the {@code oaid} column prefixed with {@code "50|"}.
 *
 * @param spark      the active Spark session
 * @param sourcePath location of the CSV input (with header)
 * @param outputPath base output location; any existing {@code /sdg} content is overwritten
 */
private static void doPrepareoaid(SparkSession spark, String sourcePath, String outputPath) {
    Dataset<Row> sdgDataset = spark
        .read()
        .format("csv")
        .option("sep", DEFAULT_DELIMITER)
        .option("inferSchema", "true")
        .option("header", "true")
        // the CSV reader option is named "quote"; the former "quotes" key was not a
        // recognised option, so the quote character only worked because '"' is the default
        .option("quote", "\"")
        .load(sourcePath);

    sdgDataset
        .groupByKey((MapFunction<Row, String>) r -> "50|" + ((String) r.getAs("oaid")), Encoders.STRING())
        .mapGroups(
            (MapGroupsFunction<String, Row, Result>) PrepareSDGSparkJob::getResult, Encoders.bean(Result.class))
        .write()
        .mode(SaveMode.Overwrite)
        .option("compression", "gzip")
        .json(outputPath + "/sdg");
}
/**
 * Builds the {@link Result} for one group of SDG rows.
 * <p>
 * The iterator is expected to hold at least one row: the first element is read
 * unconditionally, so an empty iterator fails on {@code next()}.
 *
 * @param id the identifier assigned to the result
 * @param it the rows of the group; the {@code sdg} column of each row becomes a subject
 * @return a result with the given id and one subject per row
 */
private static @NotNull Result getResult(String id, Iterator<Row> it) {
    final List<Subject> subjects = new ArrayList<>();
    do {
        final Row row = it.next();
        subjects.add(getSubject(row.getAs("sdg"), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
    } while (it.hasNext());

    final Result result = new Result();
    result.setId(id);
    result.setSubject(subjects);
    return result;
}
} }

View File

@ -13,6 +13,9 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
@ -21,9 +24,13 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.*;
import scala.Tuple2; import scala.Tuple2;
public class CreateActionSetSparkJob implements Serializable { public class CreateActionSetSparkJob implements Serializable {

View File

@ -49,9 +49,6 @@ public class ReadCOCI implements Serializable {
final String workingPath = parser.get("inputPath"); final String workingPath = parser.get("inputPath");
log.info("workingPath {}", workingPath); log.info("workingPath {}", workingPath);
final String backupPath = parser.get("backupPath");
log.info("backupPath {}", backupPath);
SparkConf sconf = new SparkConf(); SparkConf sconf = new SparkConf();
Configuration conf = new Configuration(); Configuration conf = new Configuration();
@ -71,14 +68,12 @@ public class ReadCOCI implements Serializable {
workingPath, workingPath,
fileSystem, fileSystem,
outputPath, outputPath,
backupPath,
delimiter); delimiter);
}); });
} }
private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem, private static void doRead(SparkSession spark, String workingPath, FileSystem fileSystem,
String outputPath, String outputPath,
String backupPath,
String delimiter) throws IOException { String delimiter) throws IOException {
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles( .listFiles(
@ -112,8 +107,7 @@ public class ReadCOCI implements Serializable {
.mode(SaveMode.Append) .mode(SaveMode.Append)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath); .json(outputPath);
fileSystem.rename(fileStatus.getPath(), new Path("/tmp/miriam/OC/DONE"));
fileSystem.rename(fileStatus.getPath(), new Path(backupPath));
} }
} }

View File

@ -1,80 +0,0 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
 * Iterator yielding the co-authorship relations among a list of ORCID identifiers.
 * <p>
 * For every pair of positions {@code (i, j)} with {@code i < j} it returns first the relation
 * {@code authors[i] -> authors[j]} and then the inverse {@code authors[j] -> authors[i]}; a
 * list of {@code n} identifiers therefore yields {@code n * (n - 1)} relations, and a list with
 * fewer than two identifiers yields none. Positions are compared, not values: an identifier
 * occurring twice in the list is paired with itself.
 */
public class CoAuthorshipIterator implements Iterator<Relation> {

    private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______::";
    private static final String OPENAIRE_PREFIX = "openaire____";
    private static final String SEPARATOR = "::";
    private static final String ORCID_KEY = "10|" + OPENAIRE_PREFIX + SEPARATOR
        + DHPUtils.md5(ModelConstants.ORCID.toLowerCase());
    public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
    public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";

    /** ORCID identifiers to be paired. */
    private final List<String> authors;

    /** Position of the first member of the current pair. */
    private int firstIndex;

    /** Position of the second member of the current pair; always greater than {@code firstIndex}. */
    private int secondIndex;

    /** {@code true} when the next relation to emit is the direct one (first -> second). */
    private boolean firstRelation;

    /**
     * @param authors the ORCID identifiers to pair; must not be null
     */
    public CoAuthorshipIterator(List<String> authors) {
        this.authors = authors;
        this.firstIndex = 0;
        this.secondIndex = 1;
        this.firstRelation = true;
    }

    @Override
    public boolean hasNext() {
        return firstIndex < authors.size() - 1;
    }

    /**
     * @return the next co-authorship relation
     * @throws java.util.NoSuchElementException when all the pairs have been consumed
     */
    @Override
    public Relation next() {
        if (!hasNext()) {
            // honour the Iterator contract instead of failing on an out-of-range list access
            throw new java.util.NoSuchElementException("no more co-authorship relations");
        }

        final Relation rel;
        if (firstRelation) {
            rel = getRelation(authors.get(firstIndex), authors.get(secondIndex));
            firstRelation = false;
        } else {
            rel = getRelation(authors.get(secondIndex), authors.get(firstIndex));
            firstRelation = true;
            // both directions emitted: move to the next pair
            secondIndex += 1;
            if (secondIndex >= authors.size()) {
                firstIndex += 1;
                secondIndex = firstIndex + 1;
            }
        }
        return rel;
    }

    /** Builds the {@code hasCoAuthored} relation going from {@code orcid1} to {@code orcid2}. */
    private Relation getRelation(String orcid1, String orcid2) {
        String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1);
        String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2);
        return OafMapperUtils
            .getRelation(
                source, target, ModelConstants.PERSON_PERSON_RELTYPE,
                ModelConstants.PERSON_PERSON_SUBRELTYPE,
                ModelConstants.PERSON_PERSON_HASCOAUTHORED,
                Arrays.asList(OafMapperUtils.keyValue(ORCID_KEY, ModelConstants.ORCID_DS)),
                OafMapperUtils
                    .dataInfo(
                        false, null, false, false,
                        OafMapperUtils
                            .qualifier(
                                ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
                                ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
                        "0.91"),
                null);
    }
}

View File

@ -1,20 +0,0 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.dhp.schema.oaf.Relation;
/**
 * Serializable bean wrapping the list of identifiers of the authors that share a work.
 * The list is {@code null} until {@link #setCoauthors(List)} is called.
 */
public class Coauthors implements Serializable {

    /** Identifiers of the co-authors; not initialised by default. */
    private List<String> coauthors;

    /** Creates an instance with no co-author list set. */
    public Coauthors() {
    }

    /**
     * @param coauthors the list to store; kept by reference, no copy is made
     */
    public void setCoauthors(final List<String> coauthors) {
        this.coauthors = coauthors;
    }

    /**
     * @return the stored list, or {@code null} when none has been set
     */
    public List<String> getCoauthors() {
        return this.coauthors;
    }
}

View File

@ -1,40 +0,0 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.io.Serializable;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/**
 * Serializable bean pairing a {@link Person} with a {@link Relation}.
 */
public class Couples implements Serializable {

    /** The person of the couple. */
    Person p;

    /** The relation of the couple. */
    Relation r;

    /** Creates an empty couple; both members are {@code null}. */
    public Couples() {
    }

    public Person getP() {
        return p;
    }

    public void setP(Person p) {
        this.p = p;
    }

    public Relation getR() {
        return r;
    }

    public void setR(Relation r) {
        this.r = r;
    }

    /**
     * Creates a couple from a tuple.
     * <p>
     * The factory used to declare a type parameter that appeared neither in its signature nor in
     * its body; it has been removed. Existing invocations keep compiling unchanged.
     *
     * @param couple the (person, relation) tuple; must not be null
     * @return a new instance holding the two members of the tuple
     */
    public static Couples newInstance(Tuple2<Person, Relation> couple) {
        Couples c = new Couples();
        c.p = couple._1();
        c.r = couple._2();
        return c;
    }
}

View File

@ -1,437 +0,0 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.*;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.*;
import org.apache.spark.sql.*;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.spark_project.jetty.util.StringUtil;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.orcid.model.Author;
import eu.dnetlib.dhp.collection.orcid.model.Employment;
import eu.dnetlib.dhp.collection.orcid.model.Work;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
public class ExtractPerson implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class);
// shared JSON (de)serializer for the entities read from / written to the working directory
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String OPENAIRE_PREFIX = "openaire____";
private static final String SEPARATOR = "::";
// key of the `collectedfrom` entry set on every generated person and relation:
// "10|openaire____::" followed by the md5 of the lower-cased ORCID constant
private static final String orcidKey = "10|" + OPENAIRE_PREFIX + SEPARATOR
+ DHPUtils.md5(ModelConstants.ORCID.toLowerCase());
// identifier prefixes of the relation targets; each one already ends with "::"
private static final String DOI_PREFIX = "50|doi_________::";
private static final String PMID_PREFIX = "50|pmid________::";
private static final String ARXIV_PREFIX = "50|arXiv_______::";
private static final String PMCID_PREFIX = "50|pmcid_______::";
private static final String ROR_PREFIX = "20|ror_________::";
// NOTE: unlike the prefixes above this one does NOT end with "::"; code concatenating it
// with an md5 has to add the separator itself
private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______";
// provenance qualifier (class id / class name) attached to the generated relations
public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid";
public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID";
/**
 * Entry point: parses the job arguments described by {@code as_parameters.json}, removes any
 * previous content of the output path and creates the person action set.
 *
 * @param args command line arguments ({@code isSparkSessionManaged}, {@code inputPath},
 *             {@code outputPath}, {@code workingDir})
 * @throws IOException    when the parameter description cannot be read
 * @throws ParseException when the arguments cannot be parsed
 */
public static void main(final String[] args) throws IOException, ParseException {

    final String parametersDescription = IOUtils
        .toString(
            Objects
                .requireNonNull(
                    ExtractPerson.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json")));

    final ArgumentApplicationParser parser = new ArgumentApplicationParser(parametersDescription);
    parser.parseArgument(args);

    // the session is managed by this job unless the argument says otherwise
    final String managedArgument = parser.get("isSparkSessionManaged");
    final Boolean isSparkSessionManaged = managedArgument == null
        ? Boolean.TRUE
        : Boolean.valueOf(managedArgument);
    log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

    final String inputPath = parser.get("inputPath");
    log.info("inputPath {}", inputPath);

    final String outputPath = parser.get("outputPath");
    log.info("outputPath {}", outputPath);

    final String workingDir = parser.get("workingDir");
    log.info("workingDir {}", workingDir);

    runWithSparkSession(
        new SparkConf(),
        isSparkSessionManaged,
        spark -> {
            HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
            createActionSet(spark, inputPath, outputPath, workingDir);
        });
}
/**
 * Creates the action set holding the persons found in the ORCID dump together with their
 * authorship, co-authorship and affiliation relations.
 * <p>
 * Steps:
 * <ol>
 * <li>read {@code Authors}, {@code Works} and {@code Employments} (parquet) from
 * {@code inputPath}; only the works having at least one pid with a supported schema are kept;</li>
 * <li>keep one author per ORCID among those having at least one retained work, and the
 * employments of those authors;</li>
 * <li>write persons and relations as gzip-compressed JSON under {@code workingDir}
 * ({@code /people}, {@code /authorship}, {@code /coauthorship}, {@code /affiliation});</li>
 * <li>read them back, wrap each entity in an {@link AtomicAction} and save everything as a
 * BZip2-compressed sequence file in {@code outputPath}.</li>
 * </ol>
 * Compared with the previous version, the supported-schema check is defined once and tolerates
 * pids without a schema, and the diagnostic {@code show} of the persons, which triggered an
 * additional Spark action only to print a sample on the standard output, has been removed.
 *
 * @param spark      the active Spark session
 * @param inputPath  base location of the ORCID tables; the table name is appended as it is
 * @param outputPath location of the resulting sequence file
 * @param workingDir location of the intermediate JSON datasets
 */
private static void createActionSet(SparkSession spark, String inputPath, String outputPath, String workingDir) {

    Dataset<Author> authors = spark
        .read()
        .parquet(inputPath + "Authors")
        .as(Encoders.bean(Author.class));

    // only the works that can be linked to a result through a supported pid are of interest
    Dataset<Work> works = spark
        .read()
        .parquet(inputPath + "Works")
        .as(Encoders.bean(Work.class))
        .filter(
            (FilterFunction<Work>) w -> w.getPids() != null &&
                w
                    .getPids()
                    .stream()
                    .anyMatch(p -> isSupportedPidSchema(p.getSchema())));

    Dataset<Employment> employmentDataset = spark
        .read()
        .parquet(inputPath + "Employments")
        .as(Encoders.bean(Employment.class));

    // one author per ORCID, restricted to the authors of the retained works
    Dataset<Author> peopleToMap = authors
        .joinWith(works, authors.col("orcid").equalTo(works.col("orcid")))
        .map((MapFunction<Tuple2<Author, Work>, Author>) t2 -> t2._1(), Encoders.bean(Author.class))
        .groupByKey((MapFunction<Author, String>) a -> a.getOrcid(), Encoders.STRING())
        .mapGroups((MapGroupsFunction<String, Author, Author>) (k, it) -> it.next(), Encoders.bean(Author.class));

    Dataset<Employment> employment = employmentDataset
        .joinWith(peopleToMap, employmentDataset.col("orcid").equalTo(peopleToMap.col("orcid")))
        .map((MapFunction<Tuple2<Employment, Author>, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class));

    peopleToMap
        .map((MapFunction<Author, Person>) ExtractPerson::toPerson, Encoders.bean(Person.class))
        .write()
        .option("compression", "gzip")
        .mode(SaveMode.Overwrite)
        .json(workingDir + "/people");

    works
        .flatMap(
            (FlatMapFunction<Work, Relation>) ExtractPerson::getAuthorshipRelationIterator,
            Encoders.bean(Relation.class))
        .write()
        .option("compression", "gzip")
        .mode(SaveMode.Overwrite)
        .json(workingDir + "/authorship");

    // (pid value, orcid) couples grouped by pid value give the authors sharing a work;
    // the same couple of persons can share several works, hence the final deduplication
    Dataset<Relation> coauthorship = works
        .flatMap(
            (FlatMapFunction<Work, Tuple2<String, String>>) w -> w
                .getPids()
                .stream()
                .filter(p -> isSupportedPidSchema(p.getSchema()))
                .map(p -> new Tuple2<>(p.getValue(), w.getOrcid()))
                .collect(Collectors.toList())
                .iterator(),
            Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
        .groupByKey((MapFunction<Tuple2<String, String>, String>) Tuple2::_1, Encoders.STRING())
        .mapGroups(
            (MapGroupsFunction<String, Tuple2<String, String>, Coauthors>) (k, it) -> extractCoAuthors(it),
            Encoders.bean(Coauthors.class))
        .flatMap(
            (FlatMapFunction<Coauthors, Relation>) c -> new CoAuthorshipIterator(c.getCoauthors()),
            Encoders.bean(Relation.class))
        .groupByKey((MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING())
        .mapGroups(
            (MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(), Encoders.bean(Relation.class));

    coauthorship
        .write()
        .option("compression", "gzip")
        .mode(SaveMode.Overwrite)
        .json(workingDir + "/coauthorship");

    // only the employments whose organization is identified by a ROR id produce a relation
    employment
        .filter(
            (FilterFunction<Employment>) e -> e.getAffiliationId() != null
                && "ror".equalsIgnoreCase(e.getAffiliationId().getSchema()))
        .map(
            (MapFunction<Employment, Relation>) ExtractPerson::getAffiliationRelation,
            Encoders.bean(Relation.class))
        .write()
        .option("compression", "gzip")
        .mode(SaveMode.Overwrite)
        .json(workingDir + "/affiliation");

    Dataset<Person> people = spark
        .read()
        .textFile(workingDir + "/people")
        .map(
            (MapFunction<String, Person>) value -> OBJECT_MAPPER
                .readValue(value, Person.class),
            Encoders.bean(Person.class));

    people
        .toJavaRDD()
        .map(p -> new AtomicAction(p.getClass(), p))
        .union(
            getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r)))
        .union(
            getRelations(spark, workingDir + "/coauthorship")
                .toJavaRDD()
                .map(r -> new AtomicAction(r.getClass(), r)))
        .union(
            getRelations(spark, workingDir + "/affiliation")
                .toJavaRDD()
                .map(r -> new AtomicAction(r.getClass(), r)))
        .mapToPair(
            aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
                new Text(OBJECT_MAPPER.writeValueAsString(aa))))
        .saveAsHadoopFile(
            outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}

/** Tells whether a work pid schema (doi, pmc, pmid, arxiv; any case) is used to link works; null is not. */
private static boolean isSupportedPidSchema(String schema) {
    return "doi".equalsIgnoreCase(schema)
        || "pmc".equalsIgnoreCase(schema)
        || "pmid".equalsIgnoreCase(schema)
        || "arxiv".equalsIgnoreCase(schema);
}

/** Maps an ORCID author to the corresponding {@link Person}; missing textual fields become empty values. */
private static Person toPerson(Author op) {
    Person person = new Person();
    person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX));
    person
        .setBiography(
            Optional
                .ofNullable(op.getBiography())
                .orElse(""));
    KeyValue kv = OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS);
    kv.setDataInfo(null);
    person.setCollectedfrom(Arrays.asList(kv));
    person
        .setAlternativeNames(
            Optional
                .ofNullable(op.getOtherNames())
                .orElse(new ArrayList<>()));
    person
        .setFamilyName(
            Optional
                .ofNullable(op.getFamilyName())
                .orElse(""));
    person
        .setGivenName(
            Optional
                .ofNullable(op.getGivenName())
                .orElse(""));
    person
        .setPid(
            Optional
                .ofNullable(op.getOtherPids())
                .map(
                    v -> v
                        .stream()
                        .map(
                            p -> OafMapperUtils
                                .structuredProperty(
                                    p.getValue(), p.getSchema(), p.getSchema(), ModelConstants.DNET_PID_TYPES,
                                    ModelConstants.DNET_PID_TYPES, null))
                        .collect(Collectors.toList()))
                .orElse(new ArrayList<>()));
    // the ORCID itself is always part of the person pids
    person
        .getPid()
        .add(
            OafMapperUtils
                .structuredProperty(
                    op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
                    ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
    person.setDateofcollection(op.getLastModifiedDate());
    person.setOriginalId(Arrays.asList(op.getOrcid()));
    return person;
}
/**
 * Reads a dataset of relations stored as text, one JSON document per line.
 *
 * @param spark the active Spark session
 * @param path  location of the text files
 * @return the relations obtained by parsing each line with the shared object mapper
 */
private static Dataset<Relation> getRelations(SparkSession spark, String path) {
    final MapFunction<String, Relation> parseRelation = line -> OBJECT_MAPPER.readValue(line, Relation.class);
    final Dataset<String> lines = spark.read().textFile(path);
    return lines.map(parseRelation, Encoders.bean(Relation.class));
}
/**
 * Collects the ORCID identifiers (second member of each tuple) of a group of
 * (pid value, orcid) couples.
 * <p>
 * Each identifier is kept once, in order of first appearance: when the same author declares
 * the same pid more than once, keeping the repetitions would make the author appear as a
 * co-author of themselves in the relations generated from the returned list.
 *
 * @param it the couples of one group
 * @return the distinct identifiers found in the group
 */
private static Coauthors extractCoAuthors(Iterator<Tuple2<String, String>> it) {
    // LinkedHashSet drops the repetitions while preserving the encounter order
    Set<String> distinctOrcids = new LinkedHashSet<>();
    while (it.hasNext()) {
        distinctOrcids.add(it.next()._2());
    }
    Coauthors coauth = new Coauthors();
    coauth.setCoauthors(new ArrayList<>(distinctOrcids));
    return coauth;
}
/**
 * Builds the relation linking a person to the organization (identified by a ROR id) of one of
 * their employments. Non-blank start and end dates of the employment are attached to the
 * relation as {@code startDate} / {@code endDate} properties.
 * <p>
 * The source identifier is composed as {@code PERSON_PREFIX + "::" + md5(orcid)}: the prefix
 * does not carry the separator, which was previously omitted here, so the relation pointed to
 * an identifier different from the one used for persons elsewhere in this class.
 *
 * @param row the employment; its affiliation id must be set
 * @return the person-to-organization relation
 */
private static Relation getAffiliationRelation(Employment row) {
    String source = PERSON_PREFIX + SEPARATOR + IdentifierFactory.md5(row.getOrcid());
    String target = ROR_PREFIX
        + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue()));

    Relation relation = OafMapperUtils
        .getRelation(
            source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE,
            ModelConstants.ORG_PERSON_PARTICIPATES,
            Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
            OafMapperUtils
                .dataInfo(
                    false, null, false, false,
                    OafMapperUtils
                        .qualifier(
                            ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
                            ModelConstants.DNET_PROVENANCE_ACTIONS),
                    "0.91"),
            null);

    List<KeyValue> properties = new ArrayList<>();
    if (row.getStartDate() != null && StringUtil.isNotBlank(row.getStartDate())) {
        KeyValue kv = new KeyValue();
        kv.setKey("startDate");
        kv.setValue(row.getStartDate());
        properties.add(kv);
    }
    if (row.getEndDate() != null && StringUtil.isNotBlank(row.getEndDate())) {
        KeyValue kv = new KeyValue();
        kv.setKey("endDate");
        kv.setValue(row.getEndDate());
        properties.add(kv);
    }

    // properties are set only when at least one date is available
    if (!properties.isEmpty()) {
        relation.setProperties(properties);
    }
    return relation;
}
/**
 * Builds the two {@code hasCoAuthored} relations (one per direction) between two persons.
 *
 * @param orcid1 ORCID of the first person
 * @param orcid2 ORCID of the second person
 * @return a fixed-size list holding the relation {@code orcid1 -> orcid2} followed by its inverse
 */
private static Collection<? extends Relation> getCoAuthorshipRelations(String orcid1, String orcid2) {
    final String firstPerson = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid1);
    final String secondPerson = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid2);
    return Arrays
        .asList(
            hasCoAuthoredRelation(firstPerson, secondPerson),
            hasCoAuthoredRelation(secondPerson, firstPerson));
}

/** Builds one {@code hasCoAuthored} relation, with ORCID provenance, from {@code from} to {@code to}. */
private static Relation hasCoAuthoredRelation(String from, String to) {
    return OafMapperUtils
        .getRelation(
            from, to, ModelConstants.PERSON_PERSON_RELTYPE,
            ModelConstants.PERSON_PERSON_SUBRELTYPE,
            ModelConstants.PERSON_PERSON_HASCOAUTHORED,
            Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
            OafMapperUtils
                .dataInfo(
                    false, null, false, false,
                    OafMapperUtils
                        .qualifier(
                            ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME,
                            ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
                    "0.91"),
            null);
}
/**
 * Produces the authorship relations of a work: one per pid for which {@code getRelation}
 * returns a relation.
 *
 * @param w the work
 * @return an iterator over the relations; empty when the work has no pid list
 */
private static @NotNull Iterator<Relation> getAuthorshipRelationIterator(Work w) {
    final List<Relation> relations = new ArrayList<>();
    if (w.getPids() == null) {
        return relations.iterator();
    }
    for (eu.dnetlib.dhp.collection.orcid.model.Pid pid : w.getPids()) {
        final Relation relation = getRelation(w.getOrcid(), pid);
        // pids that cannot be mapped yield null and are skipped
        if (relation != null) {
            relations.add(relation);
        }
    }
    return relations.iterator();
}
/**
 * Builds the {@code hasAuthored} relation between a person and the result identified by one
 * of the pids of a work.
 * <p>
 * The pid schema is matched ignoring the case, and PubMed Central pids are accepted both as
 * {@code pmc} and as {@code pmcid}: the works reaching this method are selected with a
 * case-insensitive check that knows the schema as {@code pmc}, so the former case-sensitive
 * match on {@code pmcid} alone never produced a relation for them.
 *
 * @param orcid the ORCID of the author
 * @param pid   the pid of the work
 * @return the relation, or {@code null} when the pid has no schema or an unsupported one
 */
private static Relation getRelation(String orcid, eu.dnetlib.dhp.collection.orcid.model.Pid pid) {
    if (pid.getSchema() == null) {
        // a switch on a null string would throw; a pid without schema cannot be mapped
        return null;
    }
    String target;
    String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
    switch (pid.getSchema().toLowerCase(Locale.ROOT)) {
        case "doi":
            target = DOI_PREFIX
                + IdentifierFactory
                    .md5(PidCleaner.normalizePidValue(PidType.doi.toString(), pid.getValue()));
            break;
        case "pmid":
            target = PMID_PREFIX
                + IdentifierFactory
                    .md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), pid.getValue()));
            break;
        case "arxiv":
            target = ARXIV_PREFIX
                + IdentifierFactory
                    .md5(PidCleaner.normalizePidValue(PidType.arXiv.toString(), pid.getValue()));
            break;
        case "pmc":
        case "pmcid":
            target = PMCID_PREFIX
                + IdentifierFactory
                    .md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), pid.getValue()));
            break;
        default:
            return null;
    }

    return OafMapperUtils
        .getRelation(
            source, target, ModelConstants.RESULT_PERSON_RELTYPE,
            ModelConstants.RESULT_PERSON_SUBRELTYPE,
            ModelConstants.RESULT_PERSON_HASAUTHORED,
            Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
            OafMapperUtils
                .dataInfo(
                    false, null, false, false,
                    OafMapperUtils
                        .qualifier(
                            ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS,
                            ModelConstants.DNET_PROVENANCE_ACTIONS),
                    "0.91"),
            null);
}
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.dhp.actionmanager.personentity;
import java.io.Serializable;
import java.util.ArrayList;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import eu.dnetlib.dhp.collection.orcid.model.Work;
/**
 * Serializable bean wrapping a list of ORCID works. A new instance starts with an empty list.
 */
public class WorkList implements Serializable {

    /** The wrapped works; empty until populated or replaced through the setter. */
    private ArrayList<Work> workArrayList = new ArrayList<>();

    /** Creates an instance holding an empty list. */
    public WorkList() {
    }

    /**
     * @param workArrayList the list to store; kept by reference, no copy is made
     */
    public void setWorkArrayList(final ArrayList<Work> workArrayList) {
        this.workArrayList = workArrayList;
    }

    /**
     * @return the stored list
     */
    public ArrayList<Work> getWorkArrayList() {
        return this.workArrayList;
    }
}

View File

@ -1,91 +0,0 @@
package eu.dnetlib.dhp.actionmanager.sdgnodoi;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Hdfs;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Result;
import scala.Tuple2;
/**
 * Spark job that reads JSON-serialized {@link Result} records (one per line) from the source path,
 * wraps each of them in an {@link AtomicAction} and stores them in a gzip-compressed sequence file
 * of (payload class name, JSON of the action) pairs.
 */
public class CreateActionSetSparkJob implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class);

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	/**
	 * Entry point: parses the arguments, removes any previous output and creates the action set.
	 *
	 * @param args the command line arguments ({@code isSparkSessionManaged}, {@code sourcePath}, {@code outputPath})
	 * @throws IOException if the parameter definition cannot be read
	 * @throws ParseException if the arguments cannot be parsed
	 */
	public static void main(final String[] args) throws IOException, ParseException {
		// NOTE(review): the parameter file is read from the "fosnodoi" folder although this class
		// lives in the "sdgnodoi" package - confirm that sharing the definition is intended.
		final String jsonConfiguration = IOUtils
			.toString(
				Objects
					.requireNonNull(
						CreateActionSetSparkJob.class
							.getResourceAsStream(
								"/eu/dnetlib/dhp/actionmanager/fosnodoi/as_parameters.json")));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		// the session is considered managed unless the argument says otherwise
		final String managedArgument = parser.get("isSparkSessionManaged");
		final Boolean isSparkSessionManaged = managedArgument == null
			? Boolean.TRUE
			: Boolean.valueOf(managedArgument);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("sourcePath");
		log.info("inputPath {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath {}", outputPath);

		runWithSparkSession(
			new SparkConf(),
			isSparkSessionManaged,
			spark -> {
				// the output location is cleaned before writing
				HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
				createActionSet(spark, inputPath, outputPath);
			});
	}

	/**
	 * Reads the results, wraps them in atomic actions and writes the sequence file.
	 *
	 * @param spark the Spark session
	 * @param inputPath path of the text files holding one JSON {@link Result} per line
	 * @param outputPath path of the sequence file to write
	 */
	private static void createActionSet(SparkSession spark, String inputPath, String outputPath) {
		spark
			.read()
			.textFile(inputPath)
			.map((MapFunction<String, Result>) CreateActionSetSparkJob::parseResult, Encoders.bean(Result.class))
			.toJavaRDD()
			.map(p -> new AtomicAction(p.getClass(), p))
			.mapToPair(CreateActionSetSparkJob::toTextPair)
			.saveAsHadoopFile(
				outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
	}

	/** Deserializes one JSON line into a {@link Result}. */
	private static Result parseResult(String json) throws IOException {
		return OBJECT_MAPPER.readValue(json, Result.class);
	}

	/** Maps an action to the (canonical name of the payload class, JSON of the action) pair. */
	private static Tuple2<Text, Text> toTextPair(AtomicAction aa) throws IOException {
		final Text key = new Text(aa.getClazz().getCanonicalName());
		final Text value = new Text(OBJECT_MAPPER.writeValueAsString(aa));
		return new Tuple2<>(key, value);
	}
}

View File

@ -5,10 +5,11 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FilterFunction;
@ -112,7 +113,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
.mapToPair( .mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa)))) new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
} }
@ -152,40 +153,11 @@ public class CreateActionSetFromWebEntries implements Serializable {
.select("OpenAlexId"); .select("OpenAlexId");
} }
private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {
if (pmcid == null)
return new ArrayList<>();
return createAffiliatioRelationPair(
PMCID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), removeResolver("PMC", pmcid))),
ror);
}
private static List<Relation> createAffiliationRelationPairPMID(String pmid, String ror) {
if (pmid == null)
return new ArrayList<>();
return createAffiliatioRelationPair(
PMID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), removeResolver("PMID", pmid))),
ror);
}
private static String removeResolver(String pidType, String pid) { private static String removeResolver(String pidType, String pid) {
switch (pidType) { if (pidType.equals("DOI")) {
case "PMID": return pid.substring(16);
return pid.substring(33);
case "PMC":
return "PMC" + pid.substring(43);
case "DOI":
return pid.substring(16);
} }
throw new IllegalArgumentException("DOI is the only supported PID type");
throw new RuntimeException();
} }
private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) { private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) {

View File

@ -1,158 +0,0 @@
package eu.dnetlib.dhp.actionmanager.webcrawl;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.*;
import java.io.File;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Optional;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.DirectoryFileFilter;
import org.apache.commons.io.filefilter.FileFileFilter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import scala.Tuple2;
/**
 * Spark job that rewrites an action set of {@link Relation}s, dropping every relation whose
 * source or target matches an identifier derived from the DOIs listed in a blacklist.
 * <p>
 * The input is a sequence file of (key, JSON of the atomic action) Text pairs; the output has the
 * same layout and is compressed with BZip2.
 */
public class RemoveRelationFromActionSet
	implements Serializable {

	// bound to this class (it used to be bound to CreateActionSetFromWebEntries by mistake)
	private static final Logger log = LoggerFactory.getLogger(RemoveRelationFromActionSet.class);

	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	/** Schema of the rows built from the (key, value) pairs of the input sequence file. */
	private static final StructType KV_SCHEMA = StructType$.MODULE$
		.apply(
			Arrays
				.asList(
					StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()),
					StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty())));

	/** Schema used to parse the JSON of an atomic action; the payload is kept as a string. */
	private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$
		.apply(
			Arrays
				.asList(
					StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()),
					StructField$.MODULE$
						.apply(
							"payload", DataTypes.StringType, false, Metadata.empty())));

	/**
	 * Entry point: parses the arguments and runs the filtering.
	 *
	 * @param args the command line arguments ({@code isSparkSessionManaged}, {@code sourcePath},
	 *            {@code outputPath}, {@code blackListPath})
	 * @throws Exception if the parameters cannot be read or parsed, or the job fails
	 */
	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				CreateActionSetFromWebEntries.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		// the actionSet path
		final String inputPath = parser.get("sourcePath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String blackListInputPath = parser.get("blackListPath");
		log.info("blackListInputPath: {}", blackListInputPath);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> removeFromActionSet(spark, inputPath, outputPath, blackListInputPath));
	}

	/**
	 * Reads the action set, removes the relations touching a blacklisted identifier and writes the result.
	 *
	 * @param spark the Spark session
	 * @param inputPath path of the action set to filter
	 * @param outputPath path where the filtered action set is written
	 * @param blackListInputPath path of the JSON blacklist; only its {@code doi} field is read
	 */
	private static void removeFromActionSet(SparkSession spark, String inputPath, String outputPath,
		String blackListInputPath) {
		// read the blacklist and turn each doi into the identifier used in the relations
		// NOTE(review): substring(16) presumably strips a resolver prefix such as "https://doi.org/";
		// shorter values would fail here - confirm the format of the blacklist.
		Dataset<String> blackList = readBlackList(spark, blackListInputPath)
			.map(
				(MapFunction<Row, String>) r -> IdentifierFactory
					.idFromPid("50", "doi", ((String) r.getAs("doi")).substring(16), true),
				Encoders.STRING());

		// read the old actionset and get the relations in the payload
		JavaPairRDD<Text, Text> seq = JavaSparkContext
			.fromSparkContext(spark.sparkContext())
			.sequenceFile(inputPath, Text.class, Text.class);

		JavaRDD<Row> rdd = seq
			.map(x -> RowFactory.create(x._1().toString(), x._2().toString()));

		Dataset<Row> actionSet = spark
			.createDataFrame(rdd, KV_SCHEMA)
			.withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA))
			.select(expr("atomic_action.*"));

		Dataset<Relation> relation = actionSet
			.map(
				(MapFunction<Row, Relation>) r -> OBJECT_MAPPER.readValue((String) r.getAs("payload"), Relation.class),
				Encoders.bean(Relation.class));

		// select only the relation not matching any pid in the blacklist as source for the relation
		Dataset<Relation> relNoSource = dropBlacklisted(relation, blackList, "source");

		// select only the relation not matching any pid in the blacklist as target of the relation
		dropBlacklisted(relNoSource, blackList, "target")
			.toJavaRDD()
			.map(p -> new AtomicAction(p.getClass(), p))
			.mapToPair(
				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
	}

	/**
	 * Keeps only the relations whose given column has no match in the blacklist (left join, then
	 * discard the rows that found a match).
	 */
	private static Dataset<Relation> dropBlacklisted(Dataset<Relation> relations, Dataset<String> blackList,
		String column) {
		return relations
			.joinWith(blackList, relations.col(column).equalTo(blackList.col("value")), "left")
			.filter((FilterFunction<Tuple2<Relation, String>>) t2 -> t2._2() == null)
			.map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class));
	}

	/** Reads the blacklist as JSON and keeps only its {@code doi} column. */
	private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
		return spark
			.read()
			.json(inputPath)
			.select("doi");
	}
}

View File

@ -22,11 +22,9 @@ import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.base.BaseCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.base.BaseCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.gtr2.Gtr2PublicationsCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.osf.OsfPreprintsCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
@ -60,7 +58,7 @@ public class CollectorWorker extends ReportingJob {
public void collect() throws UnknownCollectorPluginException, CollectorException, IOException { public void collect() throws UnknownCollectorPluginException, CollectorException, IOException {
final String outputPath = this.mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME; final String outputPath = mdStoreVersion.getHdfsPath() + SEQUENCE_FILE_NAME;
log.info("outputPath path is {}", outputPath); log.info("outputPath path is {}", outputPath);
final CollectorPlugin plugin = getCollectorPlugin(); final CollectorPlugin plugin = getCollectorPlugin();
@ -70,36 +68,36 @@ public class CollectorWorker extends ReportingJob {
try (SequenceFile.Writer writer = SequenceFile try (SequenceFile.Writer writer = SequenceFile
.createWriter( .createWriter(
this.fileSystem.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer fileSystem.getConf(),
.keyClass(IntWritable.class), SequenceFile.Writer.file(new Path(outputPath)),
SequenceFile.Writer SequenceFile.Writer.keyClass(IntWritable.class),
.valueClass(Text.class), SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) { SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
final IntWritable key = new IntWritable(counter.get()); final IntWritable key = new IntWritable(counter.get());
final Text value = new Text(); final Text value = new Text();
plugin plugin
.collect(this.api, this.report) .collect(api, report)
.forEach(content -> { .forEach(
key.set(counter.getAndIncrement()); content -> {
value.set(content); key.set(counter.getAndIncrement());
try { value.set(content);
writer.append(key, value); try {
} catch (final Throwable e) { writer.append(key, value);
throw new RuntimeException(e); } catch (Throwable e) {
} throw new RuntimeException(e);
}); }
} catch (final Throwable e) { });
this.report.put(e.getClass().getName(), e.getMessage()); } catch (Throwable e) {
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e); throw new CollectorException(e);
} finally { } finally {
shutdown(); shutdown();
this.report.ongoing(counter.longValue(), counter.longValue()); report.ongoing(counter.longValue(), counter.longValue());
} }
} }
private void scheduleReport(final AtomicInteger counter) { private void scheduleReport(AtomicInteger counter) {
schedule(new ReporterCallback() { schedule(new ReporterCallback() {
@Override @Override
public Long getCurrent() { public Long getCurrent() {
return counter.longValue(); return counter.longValue();
@ -114,37 +112,33 @@ public class CollectorWorker extends ReportingJob {
private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException { private CollectorPlugin getCollectorPlugin() throws UnknownCollectorPluginException {
switch (CollectorPlugin.NAME.valueOf(this.api.getProtocol())) { switch (CollectorPlugin.NAME.valueOf(api.getProtocol())) {
case oai: case oai:
return new OaiCollectorPlugin(this.clientParams); return new OaiCollectorPlugin(clientParams);
case rest_json2xml: case rest_json2xml:
return new RestCollectorPlugin(this.clientParams); return new RestCollectorPlugin(clientParams);
case file: case file:
return new FileCollectorPlugin(this.fileSystem); return new FileCollectorPlugin(fileSystem);
case fileGzip: case fileGzip:
return new FileGZipCollectorPlugin(this.fileSystem); return new FileGZipCollectorPlugin(fileSystem);
case baseDump: case baseDump:
return new BaseCollectorPlugin(this.fileSystem); return new BaseCollectorPlugin(this.fileSystem);
case gtr2Publications:
return new Gtr2PublicationsCollectorPlugin(this.clientParams);
case osfPreprints:
return new OsfPreprintsCollectorPlugin(this.clientParams);
case other: case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(this.api.getParams().get("other_plugin_type")) .ofNullable(api.getParams().get("other_plugin_type"))
.map(CollectorPlugin.NAME.OTHER_NAME::valueOf) .map(CollectorPlugin.NAME.OTHER_NAME::valueOf)
.orElseThrow(() -> new IllegalArgumentException("invalid other_plugin_type")); .orElseThrow(() -> new IllegalArgumentException("invalid other_plugin_type"));
switch (plugin) { switch (plugin) {
case mdstore_mongodb_dump: case mdstore_mongodb_dump:
return new MongoDbDumpCollectorPlugin(this.fileSystem); return new MongoDbDumpCollectorPlugin(fileSystem);
case mdstore_mongodb: case mdstore_mongodb:
return new MDStoreCollectorPlugin(); return new MDStoreCollectorPlugin();
default: default:
throw new UnknownCollectorPluginException("plugin is not managed: " + plugin); throw new UnknownCollectorPluginException("plugin is not managed: " + plugin);
} }
default: default:
throw new UnknownCollectorPluginException("protocol is not managed: " + this.api.getProtocol()); throw new UnknownCollectorPluginException("protocol is not managed: " + api.getProtocol());
} }
} }

View File

@ -20,9 +20,6 @@ public class Author extends ORCIDItem {
private String lastModifiedDate; private String lastModifiedDate;
public Author() {
}
public String getBiography() { public String getBiography() {
return biography; return biography;
} }

View File

@ -11,7 +11,4 @@ public class ORCIDItem {
public void setOrcid(String orcid) { public void setOrcid(String orcid) {
this.orcid = orcid; this.orcid = orcid;
} }
public ORCIDItem() {
}
} }

View File

@ -32,6 +32,4 @@ public class Work extends ORCIDItem {
pids.add(pid); pids.add(pid);
} }
public Work() {
}
} }

View File

@ -11,7 +11,7 @@ public interface CollectorPlugin {
enum NAME { enum NAME {
oai, other, rest_json2xml, file, fileGzip, baseDump, gtr2Publications, osfPreprints; oai, other, rest_json2xml, file, fileGzip, baseDump;
public enum OTHER_NAME { public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb mdstore_mongodb_dump, mdstore_mongodb

View File

@ -1,43 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.gtr2;
import java.util.Iterator;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
/**
 * Collector plugin exposing the records produced by a {@link Gtr2PublicationsIterator} as a stream.
 */
public class Gtr2PublicationsCollectorPlugin implements CollectorPlugin {

	private final HttpClientParams clientParams;

	/**
	 * @param clientParams the HTTP client parameters handed over to the iterator
	 */
	public Gtr2PublicationsCollectorPlugin(final HttpClientParams clientParams) {
		this.clientParams = clientParams;
	}

	/**
	 * Streams the publications of the given api.
	 * The optional parameters {@code startPage}, {@code endPage} and {@code fromDate} are read from the api.
	 *
	 * @param api the api descriptor providing the base url and the parameters
	 * @param report the aggregation report (not used by this plugin)
	 * @return a sequential, ordered stream of records
	 * @throws CollectorException if {@code fromDate} is present but is not in the form YYYY-MM-DD
	 */
	@Override
	public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
		final String fromDate = api.getParams().get("fromDate");

		final boolean validFromDate = (fromDate == null) || fromDate.matches("\\d{4}-\\d{2}-\\d{2}");
		if (!validFromDate) {
			throw new CollectorException("Invalid date (YYYY-MM-DD): " + fromDate);
		}

		final Iterator<String> records = new Gtr2PublicationsIterator(
			api.getBaseUrl(),
			fromDate,
			api.getParams().get("startPage"),
			api.getParams().get("endPage"),
			this.clientParams);

		return StreamSupport
			.stream(Spliterators.spliteratorUnknownSize(records, Spliterator.ORDERED), false);
	}

}

View File

@ -1,215 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.gtr2;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.function.Function;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.common.collection.HttpConnector2;
public class Gtr2PublicationsIterator implements Iterator<String> {
public static final int PAGE_SIZE = 20;
private static final Logger log = LoggerFactory.getLogger(Gtr2PublicationsIterator.class);
private final HttpConnector2 connector;
private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
private static final int MAX_ATTEMPTS = 10;
private final String baseUrl;
private int currPage;
private int endPage;
private boolean incremental = false;
private DateTime fromDate;
private final Map<String, String> cache = new HashMap<>();
private final Queue<String> queue = new LinkedList<>();
private String nextElement;
public Gtr2PublicationsIterator(final String baseUrl, final String fromDate, final String startPage,
final String endPage,
final HttpClientParams clientParams)
throws CollectorException {
this.baseUrl = baseUrl;
this.currPage = NumberUtils.toInt(startPage, 1);
this.endPage = NumberUtils.toInt(endPage, Integer.MAX_VALUE);
this.incremental = StringUtils.isNotBlank(fromDate);
this.connector = new HttpConnector2(clientParams);
if (this.incremental) {
this.fromDate = parseDate(fromDate);
}
prepareNextElement();
}
@Override
public boolean hasNext() {
return this.nextElement != null;
}
@Override
public String next() {
try {
return this.nextElement;
} finally {
prepareNextElement();
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
private void prepareNextElement() {
while ((this.currPage <= this.endPage) && this.queue.isEmpty()) {
log.debug("FETCHING PAGE + " + this.currPage + "/" + this.endPage);
this.queue.addAll(fetchPage(this.currPage++));
}
this.nextElement = this.queue.poll();
}
private List<String> fetchPage(final int pageNumber) {
final List<String> res = new ArrayList<>();
try {
final Document doc = loadURL(cleanURL(this.baseUrl + "/outcomes/publications?p=" + pageNumber), 0);
if (this.endPage == Integer.MAX_VALUE) {
this.endPage = NumberUtils.toInt(doc.valueOf("/*/@*[local-name() = 'totalPages']"));
}
for (final Object po : doc.selectNodes("//*[local-name() = 'publication']")) {
final Element mainEntity = (Element) ((Element) po).detach();
if (filterIncremental(mainEntity)) {
res.add(expandMainEntity(mainEntity));
} else {
log.debug("Skipped entity");
}
}
} catch (final Throwable e) {
log.error("Exception fetching page " + pageNumber, e);
throw new RuntimeException("Exception fetching page " + pageNumber, e);
}
return res;
}
private void addLinkedEntities(final Element master, final String relType, final Element newRoot,
final Function<Document, Element> mapper) {
for (final Object o : master.selectNodes(".//*[local-name()='link']")) {
final String rel = ((Element) o).valueOf("@*[local-name()='rel']");
final String href = ((Element) o).valueOf("@*[local-name()='href']");
if (relType.equals(rel) && StringUtils.isNotBlank(href)) {
final String cacheKey = relType + "#" + href;
if (this.cache.containsKey(cacheKey)) {
try {
log.debug(" * from cache (" + relType + "): " + href);
newRoot.add(DocumentHelper.parseText(this.cache.get(cacheKey)).getRootElement());
} catch (final DocumentException e) {
log.error("Error retrieving cache element: " + cacheKey, e);
throw new RuntimeException("Error retrieving cache element: " + cacheKey, e);
}
} else {
final Document doc = loadURL(cleanURL(href), 0);
final Element elem = mapper.apply(doc);
newRoot.add(elem);
this.cache.put(cacheKey, elem.asXML());
}
}
}
}
private boolean filterIncremental(final Element e) {
if (!this.incremental || isAfter(e.valueOf("@*[local-name() = 'created']"), this.fromDate)
|| isAfter(e.valueOf("@*[local-name() = 'updated']"), this.fromDate)) {
return true;
}
return false;
}
private String expandMainEntity(final Element mainEntity) {
final Element newRoot = DocumentHelper.createElement("doc");
newRoot.add(mainEntity);
addLinkedEntities(mainEntity, "PROJECT", newRoot, this::asProjectElement);
return DocumentHelper.createDocument(newRoot).asXML();
}
private Element asProjectElement(final Document doc) {
final Element newOrg = DocumentHelper.createElement("project");
newOrg.addElement("id").setText(doc.valueOf("/*/@*[local-name()='id']"));
newOrg
.addElement("code")
.setText(doc.valueOf("//*[local-name()='identifier' and @*[local-name()='type'] = 'RCUK']"));
newOrg.addElement("title").setText(doc.valueOf("//*[local-name()='title']"));
return newOrg;
}
private static String cleanURL(final String url) {
String cleaned = url;
if (cleaned.contains("gtr.gtr")) {
cleaned = cleaned.replace("gtr.gtr", "gtr");
}
if (cleaned.startsWith("http://")) {
cleaned = cleaned.replaceFirst("http://", "https://");
}
return cleaned;
}
private Document loadURL(final String cleanUrl, final int attempt) {
try {
log.debug(" * Downloading Url: " + cleanUrl);
final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes("UTF-8");
return DocumentHelper.parseText(new String(bytes));
} catch (final Throwable e) {
log.error("Error dowloading url: " + cleanUrl + ", attempt = " + attempt, e);
if (attempt >= MAX_ATTEMPTS) {
throw new RuntimeException("Error dowloading url: " + cleanUrl, e);
}
try {
Thread.sleep(60000); // I wait for a minute
} catch (final InterruptedException e1) {
throw new RuntimeException("Error dowloading url: " + cleanUrl, e);
}
return loadURL(cleanUrl, attempt + 1);
}
}
private DateTime parseDate(final String s) {
return DateTime.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter);
}
private boolean isAfter(final String d, final DateTime fromDate) {
return StringUtils.isNotBlank(d) && parseDate(d).isAfter(fromDate);
}
}

View File

@ -1,52 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.osf;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
/**
 * Collector plugin exposing the records produced by an {@link OsfPreprintsIterator} as a stream.
 */
public class OsfPreprintsCollectorPlugin implements CollectorPlugin {

	/** Page size used when the {@code pageSize} parameter is missing, blank or not a number. */
	public static final int PAGE_SIZE_VALUE_DEFAULT = 100;

	private final HttpClientParams clientParams;

	/**
	 * @param clientParams the HTTP client parameters handed over to the iterator
	 */
	public OsfPreprintsCollectorPlugin(final HttpClientParams clientParams) {
		this.clientParams = clientParams;
	}

	/**
	 * Streams the records of the given api.
	 *
	 * @param api the api descriptor providing the base url and the optional {@code pageSize} parameter
	 * @param report the aggregation report (not used by this plugin)
	 * @return a sequential, ordered stream of records
	 * @throws CollectorException if the base url is null or blank
	 */
	@Override
	public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
		final String baseUrl = api.getBaseUrl();

		final String requestedPageSize = api.getParams().get("pageSize");
		final int pageSize = StringUtils.isNotBlank(requestedPageSize)
			? NumberUtils.toInt(requestedPageSize, PAGE_SIZE_VALUE_DEFAULT)
			: PAGE_SIZE_VALUE_DEFAULT;

		if (StringUtils.isBlank(baseUrl)) {
			throw new CollectorException("Param 'baseUrl' is null or empty");
		}

		final Spliterator<String> records = Spliterators
			.spliteratorUnknownSize(
				new OsfPreprintsIterator(baseUrl, pageSize, getClientParams()), Spliterator.ORDERED);

		return StreamSupport.stream(records, false);
	}

	/**
	 * @return the HTTP client parameters given at construction time
	 */
	public HttpClientParams getClientParams() {
		return this.clientParams;
	}
}

View File

@ -1,151 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.osf;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.common.collection.HttpConnector2;
/**
 * Iterator over the records of a paged JSON API.
 * <p>
 * Every page is downloaded, converted to XML and split into its {@code data} entries; each entry is
 * wrapped in a {@code group} element together with the documents referenced by its
 * {@code contributors} and {@code primary_file} links, and returned as an XML string.
 * The url of the following page is read from {@code /links/next} of the current page.
 */
public class OsfPreprintsIterator implements Iterator<String> {

	private static final Logger log = LoggerFactory.getLogger(OsfPreprintsIterator.class);

	private static final int MAX_ATTEMPTS = 5;

	private final HttpClientParams clientParams;

	private final String baseUrl;
	private final int pageSize;

	// url of the next page to download; a blank or non-http value ends the iteration
	private String currentUrl;

	// NOTE: a priority queue without comparator hands the queued records out in natural String
	// order, not in arrival order
	private final Queue<String> recordQueue = new PriorityBlockingQueue<>();

	/**
	 * @param baseUrl the url of the first page, without query string
	 * @param pageSize the number of records requested per page
	 * @param clientParams the HTTP client parameters
	 */
	public OsfPreprintsIterator(
		final String baseUrl,
		final int pageSize,
		final HttpClientParams clientParams) {

		this.clientParams = clientParams;
		this.baseUrl = baseUrl;
		this.pageSize = pageSize;

		initQueue();
	}

	/** Computes the url of the first page; nothing is downloaded here. */
	private void initQueue() {
		// NOTE(review): "filter:is_published:d" is kept as found - confirm it is the filter syntax
		// expected by the remote API.
		this.currentUrl = this.baseUrl + "?filter:is_published:d=true&format=json&page[size]=" + this.pageSize;
		log.info("REST calls starting with {}", this.currentUrl);
	}

	/**
	 * Downloads pages until a record is available or there is no further page.
	 *
	 * @throws RuntimeException wrapping the CollectorException raised while downloading a page
	 */
	@Override
	public boolean hasNext() {
		synchronized (this.recordQueue) {
			while (this.recordQueue.isEmpty() && StringUtils.isNotBlank(this.currentUrl)
				&& this.currentUrl.startsWith("http")) {
				try {
					this.currentUrl = downloadPage(this.currentUrl);
				} catch (final CollectorException e) {
					log.debug("CollectorPlugin.next()-Exception: {}", e);
					throw new RuntimeException(e);
				}
			}
			return !this.recordQueue.isEmpty();
		}
	}

	/**
	 * @return the next record
	 * @throws java.util.NoSuchElementException if the iteration has no more elements
	 */
	@Override
	public String next() {
		synchronized (this.recordQueue) {
			// hasNext() loads further pages when needed, so next() also works without a previous
			// hasNext() call and never hands out null
			if (!hasNext()) {
				throw new java.util.NoSuchElementException("No more records available");
			}
			return this.recordQueue.poll();
		}
	}

	/**
	 * Downloads a page, queues one record per {@code data} entry and returns the url of the next page.
	 */
	private String downloadPage(final String url) throws CollectorException {

		final Document doc = downloadUrl(url, 0);

		for (final Object o : doc.selectNodes("/*/data")) {

			final Element n = (Element) ((Element) o).detach();

			final Element group = DocumentHelper.createElement("group");
			group.addAttribute("id", n.valueOf("./id"));

			group.addElement("preprint").add(n);

			for (final Object o1 : n.selectNodes(".//contributors//href")) {
				final String href = ((Node) o1).getText();
				if (StringUtils.isNotBlank(href) && href.startsWith("http")) {
					final Document doc1 = downloadUrl(href, 0);
					group.addElement("contributors").add(doc1.getRootElement().detach());
				}
			}
			for (final Object o1 : n.selectNodes(".//primary_file//href")) {
				final String href = ((Node) o1).getText();
				if (StringUtils.isNotBlank(href) && href.startsWith("http")) {
					final Document doc1 = downloadUrl(href, 0);
					group.addElement("primary_file").add(doc1.getRootElement().detach());
				}
			}

			this.recordQueue.add(DocumentHelper.createDocument(group).asXML());
		}

		return doc.valueOf("/*/links/next");

	}

	/**
	 * Downloads the given url and converts the JSON response to an XML document, retrying on failure
	 * with a delay growing with the attempt number.
	 *
	 * @param url the url to download
	 * @param attempt the number of attempts already failed
	 * @return the document, or a document with the single element
	 *         {@code error_401_authorization_required} when a CollectorException mentions "401"
	 * @throws CollectorException when the attempts are exhausted or the wait is interrupted
	 */
	private Document downloadUrl(final String url, final int attempt) throws CollectorException {
		if (attempt > MAX_ATTEMPTS) {
			throw new CollectorException("Max Number of attempts reached, url:" + url);
		}

		if (attempt > 0) {
			final int delay = (attempt * 5000);
			log.debug("Attempt {} with delay {}", attempt, delay);
			try {
				Thread.sleep(delay);
			} catch (final InterruptedException e) {
				// the exception used to be created without being thrown: restore the interrupt
				// flag and stop instead of silently going on with the download
				Thread.currentThread().interrupt();
				throw new CollectorException(e);
			}
		}

		try {
			log.info("requesting URL [{}]", url);

			final HttpConnector2 connector = new HttpConnector2(this.clientParams);

			final String json = connector.getInputSource(url);
			final String xml = JsonUtils.convertToXML(json);

			return DocumentHelper.parseText(xml);
		} catch (final Throwable e) {
			log.warn(e.getMessage(), e);
			// the message may be null: check it before looking for the status code
			if ((e instanceof CollectorException) && (e.getMessage() != null) && e.getMessage().contains("401")) {
				final Element root = DocumentHelper.createElement("error_401_authorization_required");
				return DocumentHelper.createDocument(root);
			}
			return downloadUrl(url, attempt + 1);
		}
	}

}

View File

@ -1,76 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.researchfi;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Collector plugin that first obtains an access token with a {@code client_credentials} request
 * and then streams the records produced by a {@link ResearchFiIterator}.
 *
 * <p>
 * The following api parameters are read: {@code auth_url}, {@code auth_client_id} and
 * {@code auth_client_secret}.
 */
public class ResearchFiCollectorPlugin implements CollectorPlugin {

	private static final Logger log = LoggerFactory.getLogger(ResearchFiCollectorPlugin.class);

	/**
	 * Authenticates against {@code auth_url} and returns a sequential, ordered stream backed by
	 * a {@link ResearchFiIterator} built on the base url of the api.
	 *
	 * @param api    the api descriptor providing the base url and the authentication parameters
	 * @param report the aggregation report (not used by this plugin)
	 * @return the stream of the collected records
	 * @throws CollectorException if the access token cannot be obtained
	 */
	@Override
	public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
		throws CollectorException {

		final String authUrl = api.getParams().get("auth_url");
		final String clientId = api.getParams().get("auth_client_id");
		final String clientSecret = api.getParams().get("auth_client_secret");

		final String authToken = authenticate(authUrl, clientId, clientSecret);

		final Iterator<String> iter = new ResearchFiIterator(api.getBaseUrl(), authToken);

		return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iter, Spliterator.ORDERED), false);
	}

	/**
	 * Posts a {@code client_credentials} form to the given url and extracts the
	 * {@code access_token} field from the JSON response.
	 *
	 * @param authUrl      the address of the token endpoint
	 * @param clientId     the value sent as {@code client_id}
	 * @param clientSecret the value sent as {@code client_secret}
	 * @return the non-blank access token
	 * @throws CollectorException if the request fails, the response cannot be parsed or the
	 *                            token is blank
	 */
	private String authenticate(final String authUrl, final String clientId, final String clientSecret)
		throws CollectorException {
		try (final CloseableHttpClient client = HttpClients.createDefault()) {
			final HttpPost req = new HttpPost(authUrl);
			final List<NameValuePair> params = new ArrayList<>();
			params.add(new BasicNameValuePair("grant_type", "client_credentials"));
			params.add(new BasicNameValuePair("client_id", clientId));
			params.add(new BasicNameValuePair("client_secret", clientSecret));

			req.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));

			try (final CloseableHttpResponse response = client.execute(req)) {
				// decode explicitly as UTF-8 instead of relying on the platform default charset
				final String content = IOUtils.toString(response.getEntity().getContent(), "UTF-8");
				final JSONObject obj = new JSONObject(content);
				final String token = obj.getString("access_token");
				if (StringUtils.isNotBlank(token)) {
					return token;
				}
			}
		} catch (final Throwable e) {
			log.warn("Error obtaining access token", e);
			throw new CollectorException("Error obtaining access token", e);
		}
		// reached only when the response was parsed but the token is blank
		throw new CollectorException("Access token is missing");
	}

}

View File

@ -1,117 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.researchfi;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.json.JSONArray;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Iterator over the records exposed by a paged HTTP endpoint.
 *
 * <p>
 * Pages are requested with the {@code PageNumber} (1-based) and {@code PageSize} query
 * parameters and a {@code Bearer} authorization header. Every response is expected to be a JSON
 * array: each item is converted to XML and queued. The total number of pages is read from the
 * {@code x-page-count} response header.
 *
 * <p>
 * NOTE(review): records are buffered in a {@link PriorityBlockingQueue}, hence within the
 * buffered pages they are returned according to the natural ordering of the strings, not in
 * arrival order.
 */
public class ResearchFiIterator implements Iterator<String> {

	private static final Log log = LogFactory.getLog(ResearchFiIterator.class);

	private static final int PAGE_SIZE = 100;

	private final String baseUrl;
	private final String authToken;

	// number of the last requested page, 0 until the first call is performed
	private int currPage;

	// total number of pages, as declared by the last response providing the x-page-count header
	private int nPages;

	private final Queue<String> queue = new PriorityBlockingQueue<>();

	/**
	 * @param baseUrl   the address of the endpoint, with or without a query string
	 * @param authToken the token sent in the {@code Authorization: Bearer} header
	 */
	public ResearchFiIterator(final String baseUrl, final String authToken) {
		this.baseUrl = baseUrl;
		this.authToken = authToken;
		this.currPage = 0;
		this.nPages = 0;
	}

	/**
	 * Performs the first call if no page has been requested yet.
	 *
	 * @throws IllegalStateException if the call fails
	 */
	private void verifyStarted() {
		if (this.currPage == 0) {
			try {
				nextCall();
			} catch (final CollectorException e) {
				throw new IllegalStateException(e);
			}
		}
	}

	@Override
	public boolean hasNext() {
		synchronized (this.queue) {
			verifyStarted();
			return !this.queue.isEmpty();
		}
	}

	/**
	 * Returns the head of the queue ({@code null} if the queue is empty) and, when the queue
	 * becomes empty, requests the following pages until a record is available or the last page
	 * has been reached.
	 *
	 * @throws IllegalStateException if a call fails
	 */
	@Override
	public String next() {
		synchronized (this.queue) {
			verifyStarted();
			final String res = this.queue.poll();
			// refill eagerly, so that hasNext() can simply look at the queue
			while (this.queue.isEmpty() && (this.currPage < this.nPages)) {
				try {
					nextCall();
				} catch (final CollectorException e) {
					throw new IllegalStateException(e);
				}
			}
			return res;
		}
	}

	/**
	 * Requests the next page and queues its records.
	 *
	 * @throws CollectorException if the request fails or the response cannot be processed
	 */
	private void nextCall() throws CollectorException {

		this.currPage += 1;

		final String url;
		if (!this.baseUrl.contains("?")) {
			url = String.format("%s?PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
		} else if (!this.baseUrl.contains("PageSize=")) {
			url = String.format("%s&PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
		} else {
			// the base url already declares its own page size
			url = String.format("%s&PageNumber=%d", this.baseUrl, this.currPage);
		}
		log.info("Calling url: " + url);

		try (final CloseableHttpClient client = HttpClients.createDefault()) {

			final HttpGet req = new HttpGet(url);
			req.addHeader("Authorization", "Bearer " + this.authToken);
			try (final CloseableHttpResponse response = client.execute(req)) {
				for (final Header header : response.getAllHeaders()) {
					log.debug("HEADER: " + header.getName() + " = " + header.getValue());
					// HTTP header names are case-insensitive: "X-Page-Count" must be accepted as well
					if ("x-page-count".equalsIgnoreCase(header.getName())) {
						final int totalPages = NumberUtils.toInt(header.getValue());
						if (this.nPages != totalPages) {
							this.nPages = totalPages;
							log.info("Total pages: " + totalPages);
						}
					}
				}

				// decode explicitly as UTF-8 instead of relying on the platform default charset
				final String content = IOUtils.toString(response.getEntity().getContent(), "UTF-8");
				final JSONArray jsonArray = new JSONArray(content);

				jsonArray.forEach(obj -> this.queue.add(JsonUtils.convertToXML(obj.toString())));
			}
		} catch (final Throwable e) {
			log.warn("Error calling url: " + url, e);
			throw new CollectorException("Error calling url: " + url, e);
		}
	}

}

View File

@ -8,10 +8,7 @@ import java.io.StringWriter;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction; import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.Iterator; import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import javax.xml.stream.XMLEventFactory; import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLEventReader;
@ -22,7 +19,6 @@ import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement; import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent; import javax.xml.stream.events.XMLEvent;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@ -62,23 +58,13 @@ public class XMLIterator implements Iterator<String> {
private String element; private String element;
private List<String> elements;
private InputStream inputStream; private InputStream inputStream;
public XMLIterator(final String element, final InputStream inputStream) { public XMLIterator(final String element, final InputStream inputStream) {
super(); super();
this.element = element; this.element = element;
if (element.contains(",")) {
elements = Arrays
.stream(element.split(","))
.filter(StringUtils::isNoneBlank)
.map(String::toLowerCase)
.collect(Collectors.toList());
}
this.inputStream = inputStream; this.inputStream = inputStream;
this.parser = getParser(); this.parser = getParser();
try { try {
this.current = findElement(parser); this.current = findElement(parser);
} catch (XMLStreamException e) { } catch (XMLStreamException e) {
@ -127,7 +113,7 @@ public class XMLIterator implements Iterator<String> {
final XMLEvent event = parser.nextEvent(); final XMLEvent event = parser.nextEvent();
// TODO: replace with depth tracking instead of close tag tracking. // TODO: replace with depth tracking instead of close tag tracking.
if (event.isEndElement() && isCheckTag(event.asEndElement().getName().getLocalPart())) { if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
writer.add(event); writer.add(event);
break; break;
} }
@ -156,16 +142,18 @@ public class XMLIterator implements Iterator<String> {
XMLEvent peek = parser.peek(); XMLEvent peek = parser.peek();
if (peek != null && peek.isStartElement()) { if (peek != null && peek.isStartElement()) {
String name = peek.asStartElement().getName().getLocalPart(); String name = peek.asStartElement().getName().getLocalPart();
if (isCheckTag(name)) if (element.equals(name)) {
return peek; return peek;
}
} }
while (parser.hasNext()) { while (parser.hasNext()) {
XMLEvent event = parser.nextEvent(); final XMLEvent event = parser.nextEvent();
if (event != null && event.isStartElement()) { if (event != null && event.isStartElement()) {
String name = event.asStartElement().getName().getLocalPart(); String name = event.asStartElement().getName().getLocalPart();
if (isCheckTag(name)) if (element.equals(name)) {
return event; return event;
}
} }
} }
return null; return null;
@ -173,31 +161,12 @@ public class XMLIterator implements Iterator<String> {
private XMLEventReader getParser() { private XMLEventReader getParser() {
try { try {
XMLInputFactory xif = inputFactory.get(); return inputFactory.get().createXMLEventReader(sanitize(inputStream));
xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
return xif.createXMLEventReader(sanitize(inputStream));
} catch (XMLStreamException e) { } catch (XMLStreamException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
private boolean isCheckTag(final String tagName) {
if (elements != null) {
final String found = elements
.stream()
.filter(e -> e.equalsIgnoreCase(tagName))
.findFirst()
.orElse(null);
if (found != null)
return true;
} else {
if (element.equalsIgnoreCase(tagName)) {
return true;
}
}
return false;
}
private Reader sanitize(final InputStream in) { private Reader sanitize(final InputStream in) {
final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder(); final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE); charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);

View File

@ -30,13 +30,13 @@
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName": "wip", "paramName": "wip",
"paramLongName": "webCrawlInputPath", "paramLongName": "webCrawlInputPath",
"paramDescription": "the path to get the input data from Web Crawl", "paramDescription": "the path to get the input data from Web Crawl",
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName": "pub", "paramName": "pip",
"paramLongName": "publisherInputPath", "paramLongName": "publisherInputPath",
"paramDescription": "the path to get the input data from publishers", "paramDescription": "the path to get the input data from publishers",
"paramRequired": true "paramRequired": true

View File

@ -24,19 +24,12 @@
"paramLongName": "outputPath", "paramLongName": "outputPath",
"paramDescription": "the hdfs name node", "paramDescription": "the hdfs name node",
"paramRequired": true "paramRequired": true
}, }, {
{ "paramName": "nn",
"paramName": "nn", "paramLongName": "hdfsNameNode",
"paramLongName": "hdfsNameNode", "paramDescription": "the hdfs name node",
"paramDescription": "the hdfs name node", "paramRequired": true
"paramRequired": true }
},
{
"paramName": "bp",
"paramLongName": "backupPath",
"paramDescription": "the hdfs path to move the OC data after the extraction",
"paramRequired": true
}
] ]

View File

@ -129,7 +129,6 @@
</spark-opts> </spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/Extracted</arg> <arg>--inputPath</arg><arg>${inputPath}/Extracted</arg>
<arg>--outputPath</arg><arg>${inputPath}/JSON</arg> <arg>--outputPath</arg><arg>${inputPath}/JSON</arg>
<arg>--backupPath</arg><arg>${inputPath}/backup</arg>
<arg>--delimiter</arg><arg>${delimiter}</arg> <arg>--delimiter</arg><arg>${delimiter}</arg>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg> <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
</spark> </spark>

View File

@ -16,11 +16,10 @@
"paramLongName": "isSparkSessionManged", "paramLongName": "isSparkSessionManged",
"paramDescription": "the hdfs name node", "paramDescription": "the hdfs name node",
"paramRequired": false "paramRequired": false
}, },{
{ "paramName": "nn",
"paramName": "nn", "paramLongName": "nameNode",
"paramLongName": "nameNode", "paramDescription": "the hdfs name node",
"paramDescription": "the hdfs name node", "paramRequired": true
"paramRequired": true }
}
] ]

View File

@ -1,25 +0,0 @@
[
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the zipped opencitations file",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the working path",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}, {
"paramName": "wd",
"paramLongName": "workingDir",
"paramDescription": "the working directory",
"paramRequired": false
}
]

View File

@ -1,2 +0,0 @@
inputPath=/data/orcid_2023/tables/
outputPath=/user/miriam.baglioni/peopleAS

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -1,111 +0,0 @@
<workflow-app name="PersonEntity" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>inputPath</name>
<description>inputPath</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="deleteoutputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="deleteoutputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="atomicactions"/>
<error to="Kill"/>
</action>
<action name="atomicactions">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the ActionSet for Person entity and relevant relations</name>
<class>eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=4
--executor-memory=4G
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=5G
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--conf spark.sql.shuffle.partitions=15000
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,20 +0,0 @@
[
{
"paramName": "sp",
"paramLongName": "sourcePath",
"paramDescription": "the zipped opencitations file",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the working path",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -1,125 +0,0 @@
<workflow-app name="SDG no doi" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sdgPath</name>
<description>the input path of the resources to be extended</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="prepareSDG"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="prepareSDG">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the results from SDG</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareSDGSparkJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sdgPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
<arg>--distributeDoi</arg><arg>false</arg>
</spark>
<ok to="produceActionSet"/>
<error to="Kill"/>
</action>
<action name="produceActionSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Save the action set grouping results with the same id</name>
<class>eu.dnetlib.dhp.actionmanager.sdgnodoi.CreateActionSetSparkJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/prepared/sdg</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -24,7 +24,7 @@
<decision name="resume_from"> <decision name="resume_from">
<switch> <switch>
<case to="reset_workingDir">${wf:conf('resumeFrom') eq 'DownloadDump'}</case> <case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed --> <default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
</switch> </switch>
</decision> </decision>
@ -33,14 +33,6 @@
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="download"/>
<error to="Kill"/>
</action>
<action name="download"> <action name="download">
<shell xmlns="uri:oozie:shell-action:0.2"> <shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>

View File

@ -1,11 +1,3 @@
#PROPERTIES TO CREATE THE ACTION SET sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
#sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/ outputPath=/tmp/miriam/webcrawlComplete/
#outputPath=/tmp/miriam/webcrawlComplete/ blackListPath=/user/miriam.baglioni/openalex-blackList
#blackListPath=/user/miriam.baglioni/openalex-blackList
#resumeFrom=create
#PROPERTIES TO REMOVE FROM THE ACTION SET
sourcePath=/var/lib/dnet/actionManager_PROD/webcrawl/rawset_28247629-468b-478e-9a42-bc540877125d_1718121542061/
outputPath=/tmp/miriam/webcrawlRemoved/
blackListPath=/user/miriam.baglioni/oalexBlackListNormalized
resumeFrom=remove

View File

@ -20,19 +20,12 @@
</configuration> </configuration>
</global> </global>
<start to="resumeFrom"/> <start to="create_actionset"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<decision name="resumeFrom">
<switch>
<case to="create_actionset">${wf:conf('resumeFrom') eq 'create'}</case>
<default to="remove_from_actionset"/>
</switch>
</decision>
<action name="create_actionset"> <action name="create_actionset">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -57,30 +50,5 @@
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="remove_from_actionset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Removes some relations found to be wrong from the AS</name>
<class>eu.dnetlib.dhp.actionmanager.webcrawl.RemoveRelationFromActionSet</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--blackListPath</arg><arg>${blackListPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -1,54 +1,44 @@
<RESOURCE_PROFILE> <RESOURCE_PROFILE>
<HEADER> <HEADER>
<RESOURCE_IDENTIFIER <RESOURCE_IDENTIFIER value="2ad0cdd9-c96c-484c-8b0e-ed56d86891fe_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU=" />
value="2ad0cdd9-c96c-484c-8b0e-ed56d86891fe_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="/> <RESOURCE_TYPE value="TransformationRuleDSResourceType" />
<RESOURCE_TYPE value="TransformationRuleDSResourceType"/> <RESOURCE_KIND value="TransformationRuleDSResources" />
<RESOURCE_KIND value="TransformationRuleDSResources"/> <RESOURCE_URI value="" />
<RESOURCE_URI value=""/> <DATE_OF_CREATION value="2024-03-05T11:23:00+00:00" />
<DATE_OF_CREATION value="2024-03-05T11:23:00+00:00"/> </HEADER>
</HEADER> <BODY>
<BODY> <CONFIGURATION>
<CONFIGURATION> <SOURCE_METADATA_FORMAT interpretation="cleaned" layout="store" name="dc" />
<SOURCE_METADATA_FORMAT interpretation="cleaned" layout="store" name="dc"/> <SINK_METADATA_FORMAT name="odf_hbase" />
<SINK_METADATA_FORMAT name="odf_hbase"/> <IMPORTED />
<IMPORTED/> <SCRIPT>
<SCRIPT> <TITLE>xslt_base2odf_hadoop</TITLE>
<TITLE>xslt_base2odf_hadoop</TITLE> <CODE>
<CODE> <xsl:stylesheet xmlns:oaire="http://namespace.openaire.eu/schema/oaire/" xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO" xmlns:base_dc="http://oai.base-search.net/base_dc/"
<xsl:stylesheet xmlns:oaire="http://namespace.openaire.eu/schema/oaire/" xmlns:datacite="http://datacite.org/schema/kernel-4" xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:dateCleaner="http://eu/dnetlib/transform/dateISO" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:vocabulary="http://eu/dnetlib/transform/clean" xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:base_dc="http://oai.base-search.net/base_dc/" xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:datacite="http://datacite.org/schema/kernel-4" exclude-result-prefixes="xsl vocabulary dateCleaner base_dc" version="2.0">
xmlns:dr="http://www.driver-repository.eu/namespace/dr" <xsl:param name="varOfficialName" />
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" <xsl:param name="varDataSourceId" />
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" <xsl:param name="varFP7" select="'corda_______::'" />
xmlns:vocabulary="http://eu/dnetlib/transform/clean" <xsl:param name="varH2020" select="'corda__h2020::'" />
xmlns:oaf="http://namespace.openaire.eu/oaf" <xsl:param name="repoCode" select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')" />
xmlns:oai="http://www.openarchives.org/OAI/2.0/" <xsl:param name="index" select="0" />
xmlns:dri="http://www.driver-repository.eu/namespace/dri" <xsl:param name="transDate" select="current-dateTime()" />
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dc="http://purl.org/dc/elements/1.1/"
exclude-result-prefixes="xsl vocabulary dateCleaner base_dc" version="2.0">
<xsl:param name="varOfficialName"/>
<xsl:param name="varDataSourceId"/>
<xsl:param name="varFP7" select="'corda_______::'"/>
<xsl:param name="varH2020" select="'corda__h2020::'"/>
<xsl:param name="repoCode"
select="substring-before(//*[local-name() = 'header']/*[local-name()='recordIdentifier'], ':')"/>
<xsl:param name="index" select="0"/>
<xsl:param name="transDate" select="current-dateTime()"/>
<xsl:template name="terminate"> <xsl:template name="terminate">
<xsl:message terminate="yes"> record is not compliant, transformation is <xsl:message terminate="yes">
interrupted. </xsl:message> record is not compliant, transformation is interrupted.
</xsl:template> </xsl:message>
</xsl:template>
<xsl:template match="/"> <xsl:template match="/">
<record> <record>
<xsl:apply-templates select="//*[local-name() = 'header']"/> <xsl:apply-templates select="//*[local-name() = 'header']" />
<!-- NOT USED <!-- NOT USED
base_dc:global_id (I used oai:identifier) base_dc:global_id (I used oai:identifier)
base_dc:collection/text() base_dc:collection/text()
base_dc:continent base_dc:continent
@ -61,481 +51,422 @@
base_dc:link (I used dc:identifier) base_dc:link (I used dc:identifier)
--> -->
<metadata> <metadata>
<datacite:resource> <datacite:resource>
<xsl:for-each select="//base_dc:doi"> <xsl:for-each select="//base_dc:doi">
<datacite:identifier identifierType="DOI"> <datacite:identifier identifierType="DOI">
<xsl:value-of select="."/> <xsl:value-of select="." />
</datacite:identifier> </datacite:identifier>
</xsl:for-each> </xsl:for-each>
<datacite:alternateIdentifiers> <datacite:alternateIdentifiers>
<xsl:for-each <xsl:for-each
select="distinct-values(//dc:identifier[starts-with(., 'http') and (not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])"> select="distinct-values(//dc:identifier[starts-with(., 'http') and (not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
<datacite:identifier alternateIdentifierType="url"> <datacite:identifier alternateIdentifierType="url">
<xsl:value-of select="."/> <xsl:value-of select="." />
</datacite:identifier> </datacite:identifier>
</xsl:for-each> </xsl:for-each>
<xsl:for-each <xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))"> <datacite:identifier alternateIdentifierType="handle">
<datacite:identifier <xsl:value-of select="." />
alternateIdentifierType="handle"> </datacite:identifier>
<xsl:value-of select="."/> </xsl:for-each>
</datacite:identifier>
</xsl:for-each>
<xsl:for-each <xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])"> <datacite:identifier alternateIdentifierType='urn'>
<datacite:identifier alternateIdentifierType="urn"> <xsl:value-of select="." />
<xsl:value-of select="."/> </datacite:identifier>
</datacite:identifier> </xsl:for-each>
</xsl:for-each>
<datacite:identifier <datacite:identifier alternateIdentifierType="oai-original">
alternateIdentifierType="oai-original"> <xsl:value-of
<xsl:value-of select="//oai:header/oai:identifier"/> select="//oai:header/oai:identifier" />
</datacite:identifier> </datacite:identifier>
</datacite:alternateIdentifiers>
</datacite:alternateIdentifiers> <datacite:relatedIdentifiers />
<datacite:relatedIdentifiers/>
<xsl:for-each select="//base_dc:typenorm"> <xsl:for-each select="//base_dc:typenorm">
<datacite:resourceType> <datacite:resourceType><xsl:value-of select="vocabulary:clean(., 'base:normalized_types')" /></datacite:resourceType>
<xsl:value-of </xsl:for-each>
select="vocabulary:clean(., 'base:normalized_types')"
/>
</datacite:resourceType>
</xsl:for-each>
<datacite:titles> <datacite:titles>
<xsl:for-each select="//dc:title"> <xsl:for-each select="//dc:title">
<datacite:title> <datacite:title>
<xsl:value-of select="normalize-space(.)"/> <xsl:value-of select="normalize-space(.)" />
</datacite:title> </datacite:title>
</xsl:for-each> </xsl:for-each>
</datacite:titles> </datacite:titles>
<datacite:creators> <datacite:creators>
<xsl:for-each select="//dc:creator"> <xsl:for-each select="//dc:creator">
<xsl:variable name="author" <xsl:variable name="author" select="normalize-space(.)" />
select="normalize-space(.)"/> <datacite:creator>
<datacite:creator> <datacite:creatorName>
<datacite:creatorName> <xsl:value-of select="$author" />
<xsl:value-of select="$author"/> </datacite:creatorName>
</datacite:creatorName> <xsl:for-each select="//base_dc:authod_id[normalize-space(./base_dc:creator_name) = $author]/base_dc:creator_id ">
<xsl:for-each <xsl:if test="contains(.,'https://orcid.org/')">
select="//base_dc:authod_id[normalize-space(./base_dc:creator_name) = $author]/base_dc:creator_id "> <nameIdentifier schemeURI="https://orcid.org/" nameIdentifierScheme="ORCID">
<xsl:if test="contains(.,'https://orcid.org/')"> <xsl:value-of select="substring-after(., 'https://orcid.org/')" />
<nameIdentifier schemeURI="https://orcid.org/" </nameIdentifier>
nameIdentifierScheme="ORCID"> </xsl:if>
<xsl:value-of </xsl:for-each>
select="substring-after(., 'https://orcid.org/')" </datacite:creator>
/> </xsl:for-each>
</nameIdentifier> </datacite:creators>
</xsl:if>
</xsl:for-each>
</datacite:creator>
</xsl:for-each>
</datacite:creators>
<datacite:contributors> <datacite:contributors>
<xsl:for-each select="//dc:contributor"> <xsl:for-each select="//dc:contributor">
<datacite:contributor> <datacite:contributor>
<datacite:contributorName> <datacite:contributorName>
<xsl:value-of select="normalize-space(.)"/> <xsl:value-of select="normalize-space(.)" />
</datacite:contributorName> </datacite:contributorName>
</datacite:contributor> </datacite:contributor>
</xsl:for-each> </xsl:for-each>
</datacite:contributors> </datacite:contributors>
<datacite:descriptions> <datacite:descriptions>
<xsl:for-each select="//dc:description"> <xsl:for-each select="//dc:description">
<datacite:description descriptionType="Abstract"> <datacite:description descriptionType="Abstract">
<xsl:value-of select="normalize-space(.)"/> <xsl:value-of select="normalize-space(.)" />
</datacite:description> </datacite:description>
</xsl:for-each> </xsl:for-each>
</datacite:descriptions> </datacite:descriptions>
<datacite:subjects> <datacite:subjects>
<xsl:for-each select="//dc:subject"> <xsl:for-each select="//dc:subject">
<datacite:subject> <datacite:subject>
<xsl:value-of select="normalize-space(.)"/> <xsl:value-of select="normalize-space(.)" />
</datacite:subject> </datacite:subject>
</xsl:for-each> </xsl:for-each>
<xsl:for-each select="//base_dc:classcode|//base_dc:autoclasscode">
<datacite:subject subjectScheme="{@type}" classificationCode="{normalize-space(.)}">
<!-- TODO the value should be obtained by the Code -->
<xsl:value-of select="normalize-space(.)" />
</datacite:subject>
</xsl:for-each>
</datacite:subjects>
<xsl:for-each select="//dc:publisher">
<datacite:publisher>
<xsl:value-of select="normalize-space(.)" />
</datacite:publisher>
</xsl:for-each>
<xsl:for-each select="//base_dc:year">
<datacite:publicationYear>
<xsl:value-of select="normalize-space(.)" />
</datacite:publicationYear>
</xsl:for-each>
<datacite:formats>
<xsl:for-each select="//dc:format">
<datacite:format>
<xsl:value-of select="normalize-space(.)" />
</datacite:format>
</xsl:for-each>
</datacite:formats>
<datacite:language>
<xsl:value-of select="vocabulary:clean( //base_dc:lang, 'dnet:languages')" />
</datacite:language>
<xsl:for-each <oaf:accessrights>
select="//base_dc:classcode|//base_dc:autoclasscode"> <xsl:if test="//base_dc:oa[.='0']">
<datacite:subject subjectScheme="{@type}" <datacite:rights rightsURI="http://purl.org/coar/access_right/c_16ec">restricted access</datacite:rights>
classificationCode="{normalize-space(.)}"> </xsl:if>
<!-- TODO the value should be obtained by the Code --> <xsl:if test="//base_dc:oa[.='1']">
<xsl:value-of select="normalize-space(.)"/> <datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
</datacite:subject> </xsl:if>
</xsl:for-each> <xsl:for-each select="//dc:rights|//base_dc:rightsnorm">
</datacite:subjects> <datacite:rights><xsl:value-of select="vocabulary:clean(., 'dnet:access_modes')" /></datacite:rights>
</xsl:for-each>
</oaf:accessrights>
<xsl:for-each select="//dc:publisher"> </datacite:resource>
<datacite:publisher>
<xsl:value-of select="normalize-space(.)"/>
</datacite:publisher>
</xsl:for-each>
<xsl:for-each select="//base_dc:year"> <xsl:for-each select="//dc:relation">
<datacite:publicationYear> <xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')">
<xsl:value-of select="normalize-space(.)"/> <oaf:projectid>
</datacite:publicationYear> <xsl:value-of select="concat($varFP7, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
</xsl:for-each> </oaf:projectid>
</xsl:if>
<xsl:if test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')">
<oaf:projectid>
<xsl:value-of select="concat($varH2020, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" />
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<datacite:formats> <xsl:choose>
<xsl:for-each select="//dc:format"> <!-- I used an inline mapping because the field typenorm could be repeated and I have to specify a list of priority -->
<datacite:format>
<xsl:value-of select="normalize-space(.)"/> <!-- Book part -->
</datacite:format> <xsl:when test="//base_dc:typenorm = '111'">
</xsl:for-each> <dr:CobjCategory type="publication">0013</dr:CobjCategory>
</datacite:formats> </xsl:when>
<!-- Book -->
<xsl:when test="//base_dc:typenorm = '11'">
<dr:CobjCategory type="publication">0002</dr:CobjCategory>
</xsl:when>
<!-- Article contribution -->
<xsl:when test="//base_dc:typenorm = '121'">
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
</xsl:when>
<!-- Journal/Newspaper -->
<xsl:when test="//base_dc:typenorm = '12'">
<dr:CobjCategory type="publication">0043</dr:CobjCategory>
</xsl:when>
<!-- Report -->
<xsl:when test="//base_dc:typenorm = '14'">
<dr:CobjCategory type="publication">0017</dr:CobjCategory>
</xsl:when>
<!-- Review -->
<xsl:when test="//base_dc:typenorm = '15'">
<dr:CobjCategory type="publication">0015</dr:CobjCategory>
</xsl:when>
<!-- Lecture -->
<xsl:when test="//base_dc:typenorm = '17'">
<dr:CobjCategory type="publication">0010</dr:CobjCategory>
</xsl:when>
<!-- Bachelor's thesis -->
<xsl:when test="//base_dc:typenorm = '181'">
<dr:CobjCategory type="publication">0008</dr:CobjCategory>
</xsl:when>
<!-- Master's thesis -->
<xsl:when test="//base_dc:typenorm = '182'">
<dr:CobjCategory type="publication">0007</dr:CobjCategory>
</xsl:when>
<!-- Doctoral and postdoctoral thesis -->
<xsl:when test="//base_dc:typenorm = '183'">
<dr:CobjCategory type="publication">0006</dr:CobjCategory>
</xsl:when>
<!-- Thesis -->
<xsl:when test="//base_dc:typenorm = '18'">
<dr:CobjCategory type="publication">0044</dr:CobjCategory>
</xsl:when>
<!-- Patent -->
<xsl:when test="//base_dc:typenorm = '1A'">
<dr:CobjCategory type="publication">0019</dr:CobjCategory>
</xsl:when>
<!-- Text -->
<xsl:when test="//base_dc:typenorm = '1'">
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
</xsl:when>
<!-- Software -->
<xsl:when test="//base_dc:typenorm = '6'">
<dr:CobjCategory type="software">0029</dr:CobjCategory>
</xsl:when>
<!-- Dataset -->
<xsl:when test="//base_dc:typenorm = '7'">
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
</xsl:when>
<!-- Still image -->
<xsl:when test="//base_dc:typenorm = '51'">
<dr:CobjCategory type="other">0025</dr:CobjCategory>
</xsl:when>
<!-- Moving image/Video -->
<xsl:when test="//base_dc:typenorm = '52'">
<dr:CobjCategory type="other">0024</dr:CobjCategory>
</xsl:when>
<!-- Image/Video -->
<xsl:when test="//base_dc:typenorm = '5'">
<dr:CobjCategory type="other">0033</dr:CobjCategory>
</xsl:when>
<datacite:language> <!-- Audio -->
<xsl:value-of <xsl:when test="//base_dc:typenorm = '4'">
select="vocabulary:clean( //base_dc:lang, 'dnet:languages')" <dr:CobjCategory type="other">0030</dr:CobjCategory>
/> </xsl:when>
</datacite:language>
<!-- Musical notation -->
<xsl:when test="//base_dc:typenorm = '2'">
<dr:CobjCategory type="other">0020</dr:CobjCategory>
</xsl:when>
<!-- Map -->
<xsl:when test="//base_dc:typenorm = '3'">
<dr:CobjCategory type="other">0020</dr:CobjCategory>
</xsl:when>
<!-- Other non-article -->
<xsl:when test="//base_dc:typenorm = '122'">
<dr:CobjCategory type="publication">0038</dr:CobjCategory>
</xsl:when>
<!-- Course material -->
<xsl:when test="//base_dc:typenorm = '16'">
<dr:CobjCategory type="publication">0038</dr:CobjCategory>
</xsl:when>
<!-- Manuscript -->
<xsl:when test="//base_dc:typenorm = '19'">
<dr:CobjCategory type="publication">0038</dr:CobjCategory>
</xsl:when>
<!-- Conference object -->
<xsl:when test="//base_dc:typenorm = '13'">
<dr:CobjCategory type="publication">0004</dr:CobjCategory>
</xsl:when>
<!--<datacite:rightsList> <!-- Unknown -->
<xsl:if test="//base_dc:oa[.='0']"> <xsl:when test="//base_dc:typenorm = 'F'">
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_16ec">restricted access</datacite:rights> <dr:CobjCategory type="other">0000</dr:CobjCategory>
</xsl:if> </xsl:when>
<xsl:if test="//base_dc:oa[.='1']"> <xsl:otherwise>
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights> <dr:CobjCategory type="other">0000</dr:CobjCategory>
</xsl:if> </xsl:otherwise>
<xsl:for-each select="//dc:rights|//base_dc:rightsnorm"> </xsl:choose>
<datacite:rights>
<xsl:value-of select="vocabulary:clean(., 'dnet:access_modes')"/> <oaf:accessrights>
</datacite:rights> <xsl:choose>
</xsl:for-each> <xsl:when test="//base_dc:oa[.='0']">CLOSED</xsl:when>
</datacite:rightsList>--> <xsl:when test="//base_dc:oa[.='1']">OPEN</xsl:when>
<xsl:when test="//base_dc:oa[.='2']">UNKNOWN</xsl:when>
<xsl:when test="//base_dc:rightsnorm">
<xsl:value-of select="vocabulary:clean(//base_dc:rightsnorm, 'dnet:access_modes')" />
</xsl:when>
<xsl:when test="//dc:rights">
<xsl:value-of select="vocabulary:clean( //dc:rights, 'dnet:access_modes')" />
</xsl:when>
<xsl:otherwise>UNKNOWN</xsl:otherwise>
</xsl:choose>
</oaf:accessrights>
</datacite:resource> <xsl:for-each select="//base_dc:doi">
<oaf:identifier identifierType="doi">
<xsl:value-of select="." />
</oaf:identifier>
</xsl:for-each>
<xsl:for-each select="//dc:relation"> <xsl:for-each
<xsl:if select="distinct-values(//dc:identifier[starts-with(., 'http') and ( not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', 'i')"> <oaf:identifier identifierType="url">
<oaf:projectid> <xsl:value-of select="." />
<xsl:value-of </oaf:identifier>
select="concat($varFP7, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/fp7/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))" </xsl:for-each>
/>
</oaf:projectid>
</xsl:if>
<xsl:if
test="matches(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', 'i')">
<oaf:projectid>
<xsl:value-of
select="concat($varH2020, replace(normalize-space(.), '(info:eu-repo/grantagreement/ec/h2020/)(\d\d\d\d\d\d)(.*)', '$2', 'i'))"
/>
</oaf:projectid>
</xsl:if>
</xsl:for-each>
<xsl:choose> <xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
<!-- I used an inline mapping because the field typenorm could be repeated and I have to specify a list of priority --> <oaf:identifier identifierType="handle">
<xsl:value-of select="." />
</oaf:identifier>
</xsl:for-each>
<!-- Book part --> <xsl:for-each select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
<xsl:when test="//base_dc:typenorm = '111'"> <oaf:identifier identifierType='urn'>
<dr:CobjCategory type="publication" <xsl:value-of select="." />
>0013</dr:CobjCategory> </oaf:identifier>
</xsl:when> </xsl:for-each>
<!-- Book --> <oaf:identifier identifierType="oai-original">
<xsl:when test="//base_dc:typenorm = '11'"> <xsl:value-of
<dr:CobjCategory type="publication" select="//oai:header/oai:identifier" />
>0002</dr:CobjCategory> </oaf:identifier>
</xsl:when>
<!-- Article contribution --> <oaf:hostedBy>
<xsl:when test="//base_dc:typenorm = '121'"> <xsl:attribute name="name">
<dr:CobjCategory type="publication" <xsl:value-of select="//base_dc:collname" />
>0001</dr:CobjCategory> </xsl:attribute>
</xsl:when> <xsl:attribute name="id">
<xsl:value-of select="concat('opendoar____::', //base_dc:collection/@opendoar_id)" />
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName" />
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId" />
</xsl:attribute>
</oaf:collectedFrom>
<!-- Journal/Newspaper --> <oaf:dateAccepted>
<xsl:when test="//base_dc:typenorm = '12'"> <xsl:value-of select="dateCleaner:dateISO( //dc:date[1] )" />
<dr:CobjCategory type="publication" </oaf:dateAccepted>
>0043</dr:CobjCategory>
</xsl:when>
<!-- Report --> <xsl:if test="//base_dc:oa[.='1']">
<xsl:when test="//base_dc:typenorm = '14'"> <xsl:for-each select="//dc:relation[starts-with(., 'http')]">
<dr:CobjCategory type="publication" <oaf:fulltext>
>0017</dr:CobjCategory> <xsl:value-of select="normalize-space(.)" />
</xsl:when> </oaf:fulltext>
</xsl:for-each>
</xsl:if>
<!-- Review --> <xsl:for-each select="//base_dc:collection/@ror_id">
<xsl:when test="//base_dc:typenorm = '15'"> <oaf:relation relType="resultOrganization" subRelType="affiliation" relClass="hasAuthorInstitution" targetType="organization">
<dr:CobjCategory type="publication" <xsl:choose>
>0015</dr:CobjCategory> <xsl:when test="contains(.,'https://ror.org/')">
</xsl:when> <xsl:value-of select="concat('ror_________::', normalize-space(.))" />
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="concat('ror_________::https://ror.org/', normalize-space(.))" />
</xsl:otherwise>
</xsl:choose>
</oaf:relation>
</xsl:for-each>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.89</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:aggregator"
classname="sysimport:crosswalk:aggregator"
schemeid="dnet:provenanceActions"
schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']" />
</record>
</xsl:template>
<!-- Lecture --> <xsl:template match="//*[local-name() = 'header']">
<xsl:when test="//base_dc:typenorm = '17'"> <xsl:if test="//oai:header/@status='deleted'">
<dr:CobjCategory type="publication" <xsl:call-template name="terminate" />
>0010</dr:CobjCategory> </xsl:if>
</xsl:when> <xsl:copy>
<xsl:apply-templates select="node()|@*" />
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate" />
</xsl:element>
</xsl:copy>
</xsl:template>
<!-- Bachelor's thesis --> <xsl:template match="node()|@*">
<xsl:when test="//base_dc:typenorm = '181'"> <xsl:copy>
<dr:CobjCategory type="publication" <xsl:apply-templates select="node()|@*" />
>0008</dr:CobjCategory> </xsl:copy>
</xsl:when> </xsl:template>
</xsl:stylesheet>
<!-- Master's thesis --> </CODE>
<xsl:when test="//base_dc:typenorm = '182'"> </SCRIPT>
<dr:CobjCategory type="publication" </CONFIGURATION>
>0007</dr:CobjCategory> <STATUS />
</xsl:when> <SECURITY_PARAMETERS />
</BODY>
<!-- Doctoral and postdoctoral thesis --> </RESOURCE_PROFILE>
<xsl:when test="//base_dc:typenorm = '183'">
<dr:CobjCategory type="publication"
>0006</dr:CobjCategory>
</xsl:when>
<!-- Thesis -->
<xsl:when test="//base_dc:typenorm = '18'">
<dr:CobjCategory type="publication"
>0044</dr:CobjCategory>
</xsl:when>
<!-- Patent -->
<xsl:when test="//base_dc:typenorm = '1A'">
<dr:CobjCategory type="publication"
>0019</dr:CobjCategory>
</xsl:when>
<!-- Text -->
<xsl:when test="//base_dc:typenorm = '1'">
<dr:CobjCategory type="publication"
>0001</dr:CobjCategory>
</xsl:when>
<!-- Software -->
<xsl:when test="//base_dc:typenorm = '6'">
<dr:CobjCategory type="software">0029</dr:CobjCategory>
</xsl:when>
<!-- Dataset -->
<xsl:when test="//base_dc:typenorm = '7'">
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
</xsl:when>
<!-- Still image -->
<xsl:when test="//base_dc:typenorm = '51'">
<dr:CobjCategory type="other">0025</dr:CobjCategory>
</xsl:when>
<!-- Moving image/Video -->
<xsl:when test="//base_dc:typenorm = '52'">
<dr:CobjCategory type="other">0024</dr:CobjCategory>
</xsl:when>
<!-- Image/Video -->
<xsl:when test="//base_dc:typenorm = '5'">
<dr:CobjCategory type="other">0033</dr:CobjCategory>
</xsl:when>
<!-- Audio -->
<xsl:when test="//base_dc:typenorm = '4'">
<dr:CobjCategory type="other">0030</dr:CobjCategory>
</xsl:when>
<!-- Musical notation -->
<xsl:when test="//base_dc:typenorm = '2'">
<dr:CobjCategory type="other">0020</dr:CobjCategory>
</xsl:when>
<!-- Map -->
<xsl:when test="//base_dc:typenorm = '3'">
<dr:CobjCategory type="other">0020</dr:CobjCategory>
</xsl:when>
<!-- Other non-article -->
<xsl:when test="//base_dc:typenorm = '122'">
<dr:CobjCategory type="publication"
>0038</dr:CobjCategory>
</xsl:when>
<!-- Course material -->
<xsl:when test="//base_dc:typenorm = '16'">
<dr:CobjCategory type="publication"
>0038</dr:CobjCategory>
</xsl:when>
<!-- Manuscript -->
<xsl:when test="//base_dc:typenorm = '19'">
<dr:CobjCategory type="publication"
>0038</dr:CobjCategory>
</xsl:when>
<!-- Conference object -->
<xsl:when test="//base_dc:typenorm = '13'">
<dr:CobjCategory type="publication"
>0004</dr:CobjCategory>
</xsl:when>
<!-- Unknown -->
<xsl:when test="//base_dc:typenorm = 'F'">
<dr:CobjCategory type="other">0000</dr:CobjCategory>
</xsl:when>
<xsl:otherwise>
<dr:CobjCategory type="other">0000</dr:CobjCategory>
</xsl:otherwise>
</xsl:choose>
<oaf:accessrights>
<xsl:choose>
<xsl:when test="//base_dc:oa[.='0']">CLOSED</xsl:when>
<xsl:when test="//base_dc:oa[.='1']">OPEN</xsl:when>
<xsl:when test="//base_dc:oa[.='2']">UNKNOWN</xsl:when>
<xsl:when test="//base_dc:rightsnorm">
<xsl:value-of
select="vocabulary:clean(//base_dc:rightsnorm, 'dnet:access_modes')"
/>
</xsl:when>
<xsl:when test="//dc:rights">
<xsl:value-of
select="vocabulary:clean( //dc:rights, 'dnet:access_modes')"
/>
</xsl:when>
<xsl:otherwise>UNKNOWN</xsl:otherwise>
</xsl:choose>
</oaf:accessrights>
<xsl:if test="//base_dc:rightsnorm">
<oaf:license><xsl:value-of select="vocabulary:clean(//base_dc:rightsnorm, 'dnet:licenses')" /></oaf:license>
</xsl:if>
<xsl:for-each select="//base_dc:doi">
<oaf:identifier identifierType="doi">
<xsl:value-of select="."/>
</oaf:identifier>
</xsl:for-each>
<xsl:for-each
select="distinct-values(//dc:identifier[starts-with(., 'http') and ( not(contains(., '://dx.doi.org/') or contains(., '://doi.org/') or contains(., '://hdl.handle.net/')))])">
<oaf:identifier identifierType="url">
<xsl:value-of select="."/>
</oaf:identifier>
</xsl:for-each>
<xsl:for-each
select="distinct-values(//dc:identifier[starts-with(., 'http') and contains(., '://hdl.handle.net/')]/substring-after(., 'hdl.handle.net/'))">
<oaf:identifier identifierType="handle">
<xsl:value-of select="."/>
</oaf:identifier>
</xsl:for-each>
<xsl:for-each
select="distinct-values(//dc:identifier[starts-with(., 'urn:nbn:nl:') or starts-with(., 'URN:NBN:NL:')])">
<oaf:identifier identifierType="urn">
<xsl:value-of select="."/>
</oaf:identifier>
</xsl:for-each>
<oaf:identifier identifierType="oai-original">
<xsl:value-of select="//oai:header/oai:identifier"/>
</oaf:identifier>
<oaf:hostedBy>
<xsl:attribute name="name">
<xsl:value-of select="//base_dc:collname"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of
select="concat('opendoar____::', //base_dc:collection/@opendoar_id)"
/>
</xsl:attribute>
</oaf:hostedBy>
<oaf:collectedFrom>
<xsl:attribute name="name">
<xsl:value-of select="$varOfficialName"/>
</xsl:attribute>
<xsl:attribute name="id">
<xsl:value-of select="$varDataSourceId"/>
</xsl:attribute>
</oaf:collectedFrom>
<oaf:dateAccepted>
<xsl:value-of select="dateCleaner:dateISO( //dc:date[1] )"/>
</oaf:dateAccepted>
<xsl:if test="//base_dc:oa[.='1']">
<xsl:for-each select="//dc:relation[starts-with(., 'http')]">
<oaf:fulltext>
<xsl:value-of select="normalize-space(.)"/>
</oaf:fulltext>
</xsl:for-each>
</xsl:if>
<xsl:for-each select="//base_dc:collection/@ror_id">
<oaf:relation relType="resultOrganization"
subRelType="affiliation" relClass="hasAuthorInstitution"
targetType="organization">
<xsl:choose>
<xsl:when test="contains(.,'https://ror.org/')">
<xsl:value-of
select="concat('ror_________::', normalize-space(.))"
/>
</xsl:when>
<xsl:otherwise>
<xsl:value-of
select="concat('ror_________::https://ror.org/', normalize-space(.))"
/>
</xsl:otherwise>
</xsl:choose>
</oaf:relation>
</xsl:for-each>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.89</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction
classid="sysimport:crosswalk:aggregator"
classname="sysimport:crosswalk:aggregator"
schemeid="dnet:provenanceActions"
schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</metadata>
<xsl:copy-of select="//*[local-name() = 'about']"/>
</record>
</xsl:template>
<xsl:template match="//*[local-name() = 'header']">
<xsl:if test="//oai:header/@status='deleted'">
<xsl:call-template name="terminate"/>
</xsl:if>
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
<xsl:element name="dr:dateOfTransformation">
<xsl:value-of select="$transDate"/>
</xsl:element>
</xsl:copy>
</xsl:template>
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>
</CODE>
</SCRIPT>
</CONFIGURATION>
<STATUS/>
<SECURITY_PARAMETERS/>
</BODY>
</RESOURCE_PROFILE>

View File

@ -332,7 +332,7 @@ case object Crossref2Oaf {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
//MAPPING Crossref DOI into PID //MAPPING Crossref DOI into PID
val doi: String = DoiCleaningRule.clean((json \ "DOI").extract[String]) val doi: String = DoiCleaningRule.normalizeDoi((json \ "DOI").extract[String])
result.setPid( result.setPid(
List( List(
structuredProperty( structuredProperty(
@ -504,24 +504,6 @@ case object Crossref2Oaf {
) )
} }
val is_review = json \ "relation" \ "is-review-of" \ "id"
if (is_review != JNothing) {
instance.setInstancetype(
OafMapperUtils.qualifier(
"0015",
"peerReviewed",
ModelConstants.DNET_REVIEW_LEVELS,
ModelConstants.DNET_REVIEW_LEVELS
)
)
}
if (doi.startsWith("10.3410") || doi.startsWith("10.12703"))
instance.setHostedby(
OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true), "H1Connect")
)
instance.setAccessright( instance.setAccessright(
decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue) decideAccessRight(instance.getLicense, result.getDateofacceptance.getValue)
) )
@ -673,7 +655,7 @@ case object Crossref2Oaf {
val doi = input.getString(0) val doi = input.getString(0)
val rorId = input.getString(1) val rorId = input.getString(1)
val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.clean(doi)}" val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.normalizeDoi(doi)}"
val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId) val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
val r: Relation = new Relation val r: Relation = new Relation
@ -906,11 +888,7 @@ case object Crossref2Oaf {
val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63") val targetId = getProjectId("cihr________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
// Added mapping for DFG
case "10.13039/501100001659" =>
val targetId = getProjectId("dfgf________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES)
case "10.13039/100020031" => case "10.13039/100020031" =>
val targetId = getProjectId("tara________", "1e5e62235d094afd01cd56e65112fc63") val targetId = getProjectId("tara________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY)
@ -1047,7 +1025,6 @@ case object Crossref2Oaf {
tp._1 match { tp._1 match {
case "electronic" => journal.setIssnOnline(tp._2) case "electronic" => journal.setIssnOnline(tp._2)
case "print" => journal.setIssnPrinted(tp._2) case "print" => journal.setIssnPrinted(tp._2)
case _ =>
} }
}) })
} }

View File

@ -2,9 +2,12 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.sx.bio.pubmed._ import eu.dnetlib.dhp.sx.bio.pubmed._
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
import eu.dnetlib.dhp.utils.ISLookupClientFactory import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
@ -14,13 +17,13 @@ import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClientBuilder import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.io.{ByteArrayInputStream, InputStream} import java.io.InputStream
import java.nio.charset.Charset import scala.io.Source
import javax.xml.stream.XMLInputFactory import scala.xml.pull.XMLEventReader
object SparkCreateBaselineDataFrame { object SparkCreateBaselineDataFrame {
@ -83,7 +86,7 @@ object SparkCreateBaselineDataFrame {
if (response.getStatusLine.getStatusCode > 400) { if (response.getStatusLine.getStatusCode > 400) {
tries -= 1 tries -= 1
} else } else
return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset()) return IOUtils.toString(response.getEntity.getContent)
} catch { } catch {
case e: Throwable => case e: Throwable =>
println(s"Error on requesting ${r.getURI}") println(s"Error on requesting ${r.getURI}")
@ -155,8 +158,7 @@ object SparkCreateBaselineDataFrame {
IOUtils.toString( IOUtils.toString(
SparkEBILinksToOaf.getClass.getResourceAsStream( SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
), )
Charset.defaultCharset()
) )
) )
parser.parseArgument(args) parser.parseArgument(args)
@ -165,11 +167,15 @@ object SparkCreateBaselineDataFrame {
val workingPath = parser.get("workingPath") val workingPath = parser.get("workingPath")
log.info("workingPath: {}", workingPath) log.info("workingPath: {}", workingPath)
val targetPath = parser.get("targetPath") val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
log.info("targetPath: {}", targetPath) log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val hdfsServerUri = parser.get("hdfsServerUri") val hdfsServerUri = parser.get("hdfsServerUri")
log.info("hdfsServerUri: {}", targetPath) log.info("hdfsServerUri: {}", hdfsServerUri)
val skipUpdate = parser.get("skipUpdate") val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate) log.info("skipUpdate: {}", skipUpdate)
@ -195,11 +201,10 @@ object SparkCreateBaselineDataFrame {
if (!"true".equalsIgnoreCase(skipUpdate)) { if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000) val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val inputFactory = XMLInputFactory.newInstance
val ds: Dataset[PMArticle] = spark.createDataset( val ds: Dataset[PMArticle] = spark.createDataset(
k.filter(i => i._1.endsWith(".gz")) k.filter(i => i._1.endsWith(".gz"))
.flatMap(i => { .flatMap(i => {
val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes())) val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
new PMParser(xml) new PMParser(xml)
}) })
) )
@ -218,8 +223,11 @@ object SparkCreateBaselineDataFrame {
.map(a => PubMedToOaf.convert(a, vocabularies)) .map(a => PubMedToOaf.convert(a, vocabularies))
.as[Oaf] .as[Oaf]
.filter(p => p != null), .filter(p => p != null),
targetPath s"$outputBasePath/$MDSTORE_DATA_PATH"
) )
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
val mdStoreSize = df.count
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
} }
} }

View File

@ -1,8 +1,7 @@
package eu.dnetlib.dhp.sx.bio.pubmed package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData import scala.xml.MetaData
import javax.xml.stream.XMLEventReader import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
/** @param xml /** @param xml
*/ */

View File

@ -30,7 +30,6 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
public class PrepareAffiliationRelationsTest { public class PrepareAffiliationRelationsTest {
@ -75,34 +74,26 @@ public class PrepareAffiliationRelationsTest {
@Test @Test
void testMatch() throws Exception { void testMatch() throws Exception {
String crossrefAffiliationRelationPathNew = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
.getPath();
String crossrefAffiliationRelationPath = getClass() String crossrefAffiliationRelationPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror_old.json") .getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
.getPath(); .getPath();
String publisherAffiliationRelationPath = getClass() String publisherAffiliationRelationPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers") .getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
.getPath(); .getPath();
String publisherAffiliationRelationOldPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publichers_old")
.getPath();
String outputPath = workingDir.toString() + "/actionSet"; String outputPath = workingDir.toString() + "/actionSet";
PrepareAffiliationRelations PrepareAffiliationRelations
.main( .main(
new String[] { new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-crossrefInputPath", crossrefAffiliationRelationPathNew, "-crossrefInputPath", crossrefAffiliationRelationPath,
"-pubmedInputPath", crossrefAffiliationRelationPath, "-pubmedInputPath", crossrefAffiliationRelationPath,
"-openapcInputPath", crossrefAffiliationRelationPathNew, "-openapcInputPath", crossrefAffiliationRelationPath,
"-dataciteInputPath", crossrefAffiliationRelationPath, "-dataciteInputPath", crossrefAffiliationRelationPath,
"-webCrawlInputPath", crossrefAffiliationRelationPath, "-webCrawlInputPath", crossrefAffiliationRelationPath,
"-publisherInputPath", publisherAffiliationRelationOldPath, "-publisherInputPath", publisherAffiliationRelationPath,
"-outputPath", outputPath "-outputPath", outputPath
}); });
@ -113,8 +104,13 @@ public class PrepareAffiliationRelationsTest {
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
// for (Relation r : tmp.collect()) {
// System.out.println(
// r.getSource() + "\t" + r.getTarget() + "\t" + r.getRelType() + "\t" + r.getRelClass() + "\t" + r.getSubRelType() + "\t" + r.getValidationDate() + "\t" + r.getDataInfo().getTrust() + "\t" + r.getDataInfo().getInferred()
// );
// }
// count the number of relations // count the number of relations
assertEquals(150, tmp.count());// 18 + 24 *3 + 30 * 2 = assertEquals(138, tmp.count());
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result"); dataset.createOrReplaceTempView("result");
@ -125,7 +121,7 @@ public class PrepareAffiliationRelationsTest {
// verify that we have equal number of bi-directional relations // verify that we have equal number of bi-directional relations
Assertions Assertions
.assertEquals( .assertEquals(
75, execVerification 69, execVerification
.filter( .filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList() .collectAsList()
@ -133,21 +129,21 @@ public class PrepareAffiliationRelationsTest {
Assertions Assertions
.assertEquals( .assertEquals(
75, execVerification 69, execVerification
.filter( .filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList() .collectAsList()
.size()); .size());
// check confidence value of a specific relation // check confidence value of a specific relation
String sourceDOI = "10.1089/10872910260066679"; String sourceDOI = "10.1061/(asce)0733-9399(2002)128:7(759)";
final String sourceOpenaireId = ID_PREFIX final String sourceOpenaireId = ID_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", sourceDOI)); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI));
Assertions Assertions
.assertEquals( .assertEquals(
"1.0", execVerification "0.7071067812", execVerification
.filter( .filter(
"source='" + sourceOpenaireId + "'") "source='" + sourceOpenaireId + "'")
.collectAsList() .collectAsList()
@ -155,34 +151,11 @@ public class PrepareAffiliationRelationsTest {
.getString(4)); .getString(4));
final String publisherid = ID_PREFIX final String publisherid = ID_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1089/10872910260066679")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s00217-010-1268-9"));
final String rorId = "20|ror_________::" + IdentifierFactory.md5("https://ror.org/05cf8a891"); final String rorId = "20|ror_________::" + IdentifierFactory.md5("https://ror.org/03265fv13");
Assertions Assertions
.assertEquals( .assertEquals(
2, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count()); 1, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
Assertions
.assertEquals(
1, execVerification
.filter(
"source = '" + ID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue("doi", "10.1007/s00217-010-1268-9"))
+ "' and target = '" + "20|ror_________::"
+ IdentifierFactory.md5("https://ror.org/03265fv13") + "'")
.count());
Assertions
.assertEquals(
3, execVerification
.filter(
"source = '" + ID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue("doi", "10.1007/3-540-47984-8_14"))
+ "' and target = '" + "20|ror_________::"
+ IdentifierFactory.md5("https://ror.org/00a0n9e72") + "'")
.count());
} }
} }

View File

@ -15,7 +15,10 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -31,7 +31,6 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
public class CreateOpenCitationsASTest { public class CreateOpenCitationsASTest {
@ -281,17 +280,17 @@ public class CreateOpenCitationsASTest {
@Test @Test
void testRelationsSourceTargetCouple() throws Exception { void testRelationsSourceTargetCouple() throws Exception {
final String doi1 = "50|doi_________::" final String doi1 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
final String doi2 = "50|doi_________::" final String doi2 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
final String doi3 = "50|doi_________::" final String doi3 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
final String doi4 = "50|doi_________::" final String doi4 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
final String doi5 = "50|doi_________::" final String doi5 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
final String doi6 = "50|doi_________::" final String doi6 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
String inputPath = getClass() String inputPath = getClass()
.getResource( .getResource(

View File

@ -119,9 +119,7 @@ public class ReadCOCITest {
workingDir.toString() + "/COCI", workingDir.toString() + "/COCI",
"-outputPath", "-outputPath",
workingDir.toString() + "/COCI_json/", workingDir.toString() + "/COCI_json/",
"-inputFile", "input1;input2;input3;input4;input5", "-inputFile", "input1;input2;input3;input4;input5"
"-format",
"COCI"
}); });
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

View File

@ -77,13 +77,13 @@ public class RemapTest {
MapOCIdsInPids MapOCIdsInPids
.main( .main(
new String[] { new String[] {
"--isSparkSessionManged", "-isSparkSessionManged",
Boolean.FALSE.toString(), Boolean.FALSE.toString(),
"--inputPath", "-inputPath",
inputPath, inputPath,
"--outputPath", "-outputPath",
workingDir.toString() + "/out/", workingDir.toString() + "/out/",
"--nameNode", "hdfs://localhost" "-nameNode", "input1;input2;input3;input4;input5"
}); });
} }

View File

@ -1,213 +0,0 @@
package eu.dnetlib.dhp.actionmanager.person;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Person;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
 * Integration test for {@link ExtractPerson}: runs the job on the fixtures found under
 * {@code /eu/dnetlib/dhp/actionmanager/person/} and verifies the {@link Person} entities and
 * the {@link Relation}s written to the produced action set.
 */
public class CreatePersonAS {

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	private static Path workingDir;

	private static final Logger log = LoggerFactory
		.getLogger(CreatePersonAS.class);

	/**
	 * Creates the temporary working directory and a local Spark session shared by the tests.
	 *
	 * @throws IOException if the temporary directory cannot be created
	 */
	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files
			.createTempDirectory(CreatePersonAS.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(CreatePersonAS.class.getSimpleName());
		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.codegen.wholeStage", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(CreatePersonAS.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	/**
	 * Removes the working directory and stops the Spark session.
	 *
	 * @throws IOException if the working directory cannot be deleted
	 */
	@AfterAll
	public static void afterAll() throws IOException {
		try {
			FileUtils.deleteDirectory(workingDir.toFile());
		} finally {
			// stop the session even when the cleanup of the directory fails
			spark.stop();
		}
	}

	/**
	 * Runs {@link ExtractPerson} and checks the number and content of the extracted persons and
	 * of the authorship / co-authorship relations.
	 */
	@Test
	void testAuthors() throws Exception {

		final String inputPath = getClass()
			.getResource(
				"/eu/dnetlib/dhp/actionmanager/person/")
			.getPath();
		final String actionSetPath = workingDir.toString() + "/actionSet1";

		ExtractPerson
			.main(
				new String[] {
					"-isSparkSessionManaged",
					Boolean.FALSE.toString(),
					"-inputPath",
					inputPath,
					"-outputPath",
					actionSetPath,
					"-workingDir",
					workingDir.toString() + "/working"
				});

		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

		// the action set mixes payload types: the record key holds the payload class name
		final JavaRDD<Relation> relations = sc
			.sequenceFile(actionSetPath, Text.class, Text.class)
			.filter(v -> "eu.dnetlib.dhp.schema.oaf.Relation".equalsIgnoreCase(v._1().toString()))
			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
			.map(aa -> ((Relation) aa.getPayload()));

		final JavaRDD<Person> people = sc
			.sequenceFile(actionSetPath, Text.class, Text.class)
			.filter(v -> "eu.dnetlib.dhp.schema.oaf.Person".equalsIgnoreCase(v._1().toString()))
			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
			.map(aa -> ((Person) aa.getPayload()));

		Assertions.assertEquals(7, people.count());

		// fetch the person once: every filter(...).first() is a separate Spark job
		final Person paulo = people
			.filter(
				p -> p.getPid().stream().anyMatch(id -> id.getValue().equalsIgnoreCase("0000-0002-3210-3034")))
			.first();

		Assertions.assertEquals("Paulo", paulo.getGivenName());
		Assertions.assertEquals("Tavares", paulo.getFamilyName());
		Assertions.assertEquals(4, paulo.getAlternativeNames().size());
		Assertions.assertEquals(4, paulo.getPid().size());
		Assertions
			.assertTrue(
				paulo
					.getPid()
					.stream()
					.anyMatch(
						p -> p.getQualifier().getSchemename().equalsIgnoreCase("Scopus Author ID")
							&& p.getValue().equalsIgnoreCase("15119405200")));

		Assertions
			.assertEquals(
				16,
				relations
					.filter(r -> r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED))
					.count());
		Assertions
			.assertEquals(
				14,
				relations
					.filter(r -> r.getRelClass().equalsIgnoreCase(ModelConstants.PERSON_PERSON_HASCOAUTHORED))
					.count());

		// identifier of the person checked below, computed once instead of once per record
		final String orcidSource = "30|orcid_______::" + DHPUtils.md5("0000-0001-6291-9619");

		Assertions
			.assertEquals(
				3,
				relations
					.filter(
						r -> r.getSource().equalsIgnoreCase(orcidSource)
							&& r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED))
					.count());
		Assertions
			.assertEquals(
				2,
				relations
					.filter(
						r -> r.getSource().equalsIgnoreCase(orcidSource)
							&& r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED)
							&& r.getTarget().startsWith("50|doi"))
					.count());
		Assertions
			.assertEquals(
				1,
				relations
					.filter(
						r -> r.getSource().equalsIgnoreCase(orcidSource)
							&& r.getRelClass().equalsIgnoreCase(ModelConstants.RESULT_PERSON_HASAUTHORED)
							&& r.getTarget().startsWith("50|arXiv"))
					.count());
		Assertions
			.assertEquals(
				1,
				relations
					.filter(
						r -> r.getSource().equalsIgnoreCase(orcidSource)
							&& r.getRelClass().equalsIgnoreCase(ModelConstants.PERSON_PERSON_HASCOAUTHORED))
					.count());

		Assertions.assertEquals(33, relations.count());
	}
}

View File

@ -28,7 +28,6 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
@ -271,17 +270,17 @@ public class CreateTAActionSetTest {
@Test @Test
void testRelationsSourceTargetCouple() throws Exception { void testRelationsSourceTargetCouple() throws Exception {
final String doi1 = "50|doi_________::" final String doi1 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
final String doi2 = "50|doi_________::" final String doi2 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
final String doi3 = "50|doi_________::" final String doi3 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
final String doi4 = "50|doi_________::" final String doi4 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
final String doi5 = "50|doi_________::" final String doi5 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
final String doi6 = "50|doi_________::" final String doi6 = "50|doi_________::"
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5")); + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
String inputPath = getClass() String inputPath = getClass()
.getResource( .getResource(

View File

@ -2,7 +2,6 @@
package eu.dnetlib.dhp.actionmanager.webcrawl; package eu.dnetlib.dhp.actionmanager.webcrawl;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
@ -102,10 +101,7 @@ public class CreateASTest {
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
tmp.foreach(r -> System.out.println(new ObjectMapper().writeValueAsString(r))); Assertions.assertEquals(58, tmp.count());
tmp.foreach(r -> assertTrue(r.getSource().startsWith("20|ror") || r.getSource().startsWith("50|doi")));
tmp.foreach(r -> assertTrue(r.getTarget().startsWith("20|ror") || r.getTarget().startsWith("50|doi")));
Assertions.assertEquals(24, tmp.count());
} }
@ -116,7 +112,7 @@ public class CreateASTest {
String inputPath = getClass() String inputPath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/actionmanager/webcrawl/input/") "/eu/dnetlib/dhp/actionmanager/webcrawl/")
.getPath(); .getPath();
String blackListPath = getClass() String blackListPath = getClass()
.getResource( .getResource(
@ -198,7 +194,7 @@ public class CreateASTest {
Assertions Assertions
.assertEquals( .assertEquals(
1, tmp 2, tmp
.filter( .filter(
r -> r r -> r
.getSource() .getSource()
@ -211,7 +207,7 @@ public class CreateASTest {
Assertions Assertions
.assertEquals( .assertEquals(
1, tmp 2, tmp
.filter( .filter(
r -> r r -> r
.getTarget() .getTarget()
@ -232,13 +228,13 @@ public class CreateASTest {
"20|ror_________::" + IdentifierFactory "20|ror_________::" + IdentifierFactory
.md5( .md5(
PidCleaner PidCleaner
.normalizePidValue("ROR", "https://ror.org/03265fv13"))) .normalizePidValue(PidType.doi.toString(), "https://ror.org/03265fv13")))
&& r.getSource().startsWith("50|doi")) && r.getSource().startsWith("50|doi"))
.count()); .count());
Assertions Assertions
.assertEquals( .assertEquals(
0, tmp 1, tmp
.filter( .filter(
r -> r r -> r
.getTarget() .getTarget()
@ -272,10 +268,6 @@ public class CreateASTest {
.getResource( .getResource(
"/eu/dnetlib/dhp/actionmanager/webcrawl") "/eu/dnetlib/dhp/actionmanager/webcrawl")
.getPath(); .getPath();
String blackListPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
.getPath();
CreateActionSetFromWebEntries CreateActionSetFromWebEntries
.main( .main(
@ -285,8 +277,7 @@ public class CreateASTest {
"-sourcePath", "-sourcePath",
inputPath, inputPath,
"-outputPath", "-outputPath",
workingDir.toString() + "/actionSet1", workingDir.toString() + "/actionSet1"
"-blackListPath", blackListPath
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

View File

@ -1,108 +0,0 @@
package eu.dnetlib.dhp.actionmanager.webcrawl;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
/**
 * Runs {@link RemoveRelationFromActionSet} against a sample action set and a black list, then
 * checks how many relations are left in the output.
 *
 * @author miriam.baglioni
 * @Date 22/04/24
 */
public class RemoveFromASTest {

	private static final Logger log = LoggerFactory
		.getLogger(RemoveFromASTest.class);

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static Path workingDir;

	private static SparkSession spark;

	/** Prepares the temporary working directory and the local Spark session used by the test. */
	@BeforeAll
	public static void beforeAll() throws IOException {
		final String testName = RemoveFromASTest.class.getSimpleName();

		workingDir = Files.createTempDirectory(testName);
		log.info("using work dir {}", workingDir);

		final SparkConf sparkConf = new SparkConf()
			.setAppName(testName)
			.setMaster("local[*]")
			.set("spark.driver.host", "localhost")
			.set("hive.metastore.local", "true")
			.set("spark.ui.enabled", "false")
			.set("spark.sql.warehouse.dir", workingDir.toString())
			.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(testName)
			.config(sparkConf)
			.getOrCreate();
	}

	/** Deletes the working directory, then stops the Spark session. */
	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	/** The output action set is expected to contain 22 relations. */
	@Test
	void testNumberofRelations() throws Exception {
		final String blackList = getClass()
			.getResource("/eu/dnetlib/dhp/actionmanager/webcrawl/blackListRemove/")
			.getPath();
		final String source = getClass()
			.getResource("/eu/dnetlib/dhp/actionmanager/webcrawl/actionSet/")
			.getPath();
		final String output = workingDir.toString() + "/actionSet1";

		final String[] jobArgs = {
			"-isSparkSessionManaged", Boolean.FALSE.toString(),
			"-sourcePath", source,
			"-outputPath", output,
			"-blackListPath", blackList
		};
		RemoveRelationFromActionSet.main(jobArgs);

		final JavaSparkContext context = new JavaSparkContext(spark.sparkContext());

		// each record value is a serialized AtomicAction whose payload is a Relation
		final JavaRDD<Relation> remaining = context
			.sequenceFile(output, Text.class, Text.class)
			.map(record -> OBJECT_MAPPER.readValue(record._2().toString(), AtomicAction.class))
			.map(action -> (Relation) action.getPayload());

		assertEquals(22, remaining.count());
	}
}

View File

@ -1,64 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.IOException;
import java.util.HashMap;
import java.util.Objects;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Checks that {@link FileGZipCollectorPlugin} collects records from the {@code dblp.gz} fixture
 * when {@code splitOnElement} lists more than one element name.
 */
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@ExtendWith(MockitoExtension.class)
public class FileGZipMultipleNodeTest {

	// the logger must be bound to this class, so that log lines are attributed to the right test
	private static final Logger log = LoggerFactory.getLogger(FileGZipMultipleNodeTest.class);

	/** Comma separated element names passed to the plugin as the {@code splitOnElement} parameter. */
	private static final String SPLIT_ON_ELEMENT = "incollection,article";

	private final ApiDescriptor api = new ApiDescriptor();

	private FileGZipCollectorPlugin plugin;

	/**
	 * Points the api descriptor to the gzipped fixture and creates the plugin under test.
	 *
	 * @throws IOException if the file system cannot be obtained
	 */
	@BeforeEach
	public void setUp() throws IOException {

		final String gzipFile = Objects
			.requireNonNull(
				this
					.getClass()
					.getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz"))
			.getFile();

		api.setBaseUrl(gzipFile);

		HashMap<String, String> params = new HashMap<>();
		params.put("splitOnElement", SPLIT_ON_ELEMENT);

		api.setParams(params);

		FileSystem fs = FileSystem.get(new Configuration());
		plugin = new FileGZipCollectorPlugin(fs);
	}

	/**
	 * Collects the first 10 records and verifies that none of them is empty.
	 *
	 * @throws CollectorException if the plugin fails to collect
	 */
	@Test
	void test() throws CollectorException {

		// close the stream when done, so that any resource bound to it gets released
		try (Stream<String> stream = plugin.collect(api, new AggregatorReport())) {
			stream.limit(10).forEach(s -> {
				Assertions.assertFalse(s.isEmpty());
				log.info(s);
			});
		}
	}
}

View File

@ -1,103 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.gtr2;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.util.Iterator;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
/**
 * Manual tests for {@link Gtr2PublicationsIterator}. Every test is {@code @Disabled}: each one
 * builds the iterator against the {@code gtr.ukri.org} API base URL.
 */
class Gtr2PublicationsIteratorTest {

	private static final String BASE_URL = "https://gtr.ukri.org/gtr/api";

	private static final HttpClientParams CLIENT_PARAMS = new HttpClientParams();

	/** Prints the first publication, when there is one. */
	@Test
	@Disabled
	public void testOne() throws Exception {
		System.out.println("one publication");

		final Iterator<String> publications = new Gtr2PublicationsIterator(BASE_URL, null, null, null,
			CLIENT_PARAMS);
		if (!publications.hasNext()) {
			return;
		}

		final String first = publications.next();
		assertNotNull(first);
		System.out.println(first);
	}

	/** Iterates over a single page (start = end = 2), pausing 300 ms before each record. */
	@Test
	@Disabled
	public void testPaging() throws Exception {
		final Iterator<String> publications = new Gtr2PublicationsIterator(BASE_URL, null, "2", "2",
			CLIENT_PARAMS);

		while (publications.hasNext()) {
			Thread.sleep(300);
			final String record = publications.next();
			assertNotNull(record);
			System.out.println(record);
		}
	}

	/** A single page (start = end = 12) is expected to yield 20 publications. */
	@Test
	@Disabled
	public void testOnePage() throws Exception {
		final Iterator<String> publications = new Gtr2PublicationsIterator(BASE_URL, null, "12", "12",
			CLIENT_PARAMS);

		assertEquals(20, iterateAndCount(publications));
	}

	/** With the from date "2050-12-12T" no publication is expected. */
	@Test
	@Disabled
	public void testIncrementalHarvestingNoRecords() throws Exception {
		System.out.println("incremental Harvesting");

		final Iterator<String> publications = new Gtr2PublicationsIterator(BASE_URL, "2050-12-12T", "11", "13",
			CLIENT_PARAMS);

		assertEquals(0, iterateAndCount(publications));
	}

	/** With the from date "2016-11-30" and page 11 only, 20 publications are expected. */
	@Test
	@Disabled
	public void testIncrementalHarvesting() throws Exception {
		System.out.println("incremental Harvesting");

		final Iterator<String> publications = new Gtr2PublicationsIterator(BASE_URL, "2016-11-30", "11", "11",
			CLIENT_PARAMS);

		assertEquals(20, iterateAndCount(publications));
	}

	/** Consumes the whole iterator and checks that no record is null. */
	@Test
	@Disabled
	public void testCompleteHarvesting() throws Exception {
		System.out.println("testing complete harvesting");

		final Iterator<String> publications = new Gtr2PublicationsIterator(BASE_URL, null, null, null,
			CLIENT_PARAMS);

		// records are only checked, not printed
		while (publications.hasNext()) {
			assertNotNull(publications.next());
		}
	}

	/**
	 * Drains the iterator asserting that every element is not null.
	 *
	 * @param iterator the iterator to consume
	 * @return the number of elements returned by the iterator
	 */
	private int iterateAndCount(final Iterator<String> iterator) throws Exception {
		int seen = 0;
		for (; iterator.hasNext(); seen++) {
			assertNotNull(iterator.next());
		}
		System.out.println("Got " + seen + " publications");
		return seen;
	}
}

View File

@ -1,122 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.osf;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream;
import org.dom4j.DocumentHelper;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.common.collection.HttpConnector2;
/**
 * Tests for {@link OsfPreprintsCollectorPlugin}, configured with the OSF preprints endpoint
 * ({@code https://api.osf.io/v2/preprints/}) and a page size of 100.
 *
 * <p>Every test that goes through the plugin or the HTTP connector is {@code @Disabled},
 * so only {@link #testXML()} runs by default.
 */
public class OsfPreprintsCollectorPluginTest {

    // Logger is bound to this test class (not to the plugin under test) so that
    // log lines are attributed to the test.
    private static final Logger log = LoggerFactory.getLogger(OsfPreprintsCollectorPluginTest.class);

    private final String baseUrl = "https://api.osf.io/v2/preprints/";

    private final int pageSize = 100;

    private final ApiDescriptor api = new ApiDescriptor();

    private OsfPreprintsCollectorPlugin plugin;

    /** Populates the API descriptor (base URL and {@code pageSize}) and creates a fresh plugin. */
    @BeforeEach
    public void setUp() {
        final HashMap<String, String> params = new HashMap<>();
        params.put("pageSize", "" + this.pageSize);

        this.api.setBaseUrl(this.baseUrl);
        this.api.setParams(params);

        this.plugin = new OsfPreprintsCollectorPlugin(new HttpClientParams());
    }

    /** Collects a single record and logs it. */
    @Test
    @Disabled
    void test_one() throws CollectorException {
        this.plugin
            .collect(this.api, new AggregatorReport())
            .limit(1)
            .forEach(log::info);
    }

    /** Collects at most 2000 records, checking each is non-empty and that at least one arrived. */
    @Test
    @Disabled
    void test_limited() throws CollectorException {
        final AtomicInteger i = new AtomicInteger(0);
        final Stream<String> stream = this.plugin.collect(this.api, new AggregatorReport());

        stream.limit(2000).forEach(s -> {
            Assertions.assertTrue(s.length() > 0);
            i.incrementAndGet();
            log.info(s);
        });

        log.info("{}", i.intValue());
        Assertions.assertTrue(i.intValue() > 0);
    }

    /** Collects every record, logging progress every 1000 records, and checks at least one arrived. */
    @Test
    @Disabled
    void test_all() throws CollectorException {
        final AtomicLong i = new AtomicLong(0);
        final Stream<String> stream = this.plugin.collect(this.api, new AggregatorReport());

        stream.forEach(s -> {
            Assertions.assertTrue(s.length() > 0);
            if ((i.incrementAndGet() % 1000) == 0) {
                log.info("COLLECTED: {}", i.get());
            }
        });

        log.info("TOTAL: {}", i.get());
        Assertions.assertTrue(i.get() > 0);
    }

    /**
     * Requests a contributors URL and expects the connector to fail with a
     * {@link CollectorException} whose message mentions {@code 401}.
     *
     * <p>{@code assertThrows} is used instead of a try/fail/catch-Throwable construct: the
     * latter also caught the {@code AssertionError} raised by {@code fail()} and then
     * dereferenced a possibly-null message, hiding the real reason for a failure.
     */
    @Test
    @Disabled
    void test_authentication_required() {
        final HttpConnector2 connector = new HttpConnector2();

        final CollectorException e = Assertions
            .assertThrows(
                CollectorException.class,
                () -> connector
                    .getInputSource("https://api.osf.io/v2/preprints/ydtzx/contributors/?format=json"));

        final String message = e.getMessage();
        log.info("**** ERROR: {}", message);

        // Guard against a null message so a wrong failure surfaces as an assertion, not an NPE.
        assertTrue((message != null) && message.contains("401"));
    }

    /** Converts a minimal JSON document to XML and logs the result (no assertion on its content). */
    @Test
    void testXML() {
        final String xml = JsonUtils.convertToXML("{'next':null}");
        log.info("{}", xml);
    }

}

View File

@ -1,58 +0,0 @@
package eu.dnetlib.dhp.collection.plugin.researchfi;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Manual check for {@link ResearchFiCollectorPlugin}: collects funding decisions and reports
 * how many records and how many distinct {@code funderProjectNumber} values were seen.
 * The only test is {@code @Disabled}; the client id and secret are left blank here.
 */
public class ResearchFiCollectorPluginTest {

    private final ResearchFiCollectorPlugin plugin = new ResearchFiCollectorPlugin();

    /**
     * Streams all records, printing the first one and any repeated project number, then
     * prints the totals. A record that cannot be parsed as XML aborts the run.
     */
    @Test
    @Disabled
    void testCollect() throws CollectorException {
        final ApiDescriptor descriptor = new ApiDescriptor();
        descriptor.setBaseUrl("https://research.fi/api/rest/v1/funding-decisions?FunderName=AKA&FundingStartYearFrom=2022");
        descriptor.setProtocol("research_fi");
        descriptor
            .getParams()
            .put("auth_url", "https://researchfi-auth.2.rahtiapp.fi/realms/publicapi/protocol/openid-connect/token");
        descriptor.getParams().put("auth_client_id", "");
        descriptor.getParams().put("auth_client_secret", "");

        final AtomicLong seen = new AtomicLong(0);
        final Set<String> projectNumbers = new HashSet<>();

        this.plugin.collect(descriptor, new AggregatorReport()).forEach(record -> {
            final boolean isFirst = seen.getAndIncrement() == 0;
            if (isFirst) {
                System.out.println("First: " + record);
            }

            final String projectNumber;
            try {
                projectNumber = DocumentHelper.parseText(record).valueOf("/recordWrap/funderProjectNumber");
            } catch (final DocumentException e) {
                throw new RuntimeException(e);
            }

            // Set.add returns false when the value was already stored.
            if (!projectNumbers.add(projectNumber)) {
                System.out.println("Id already present: " + projectNumber);
            }
        });

        System.out.println("Total records: " + seen);
        System.out.println("Total identifiers: " + projectNumbers.size());
    }

}

View File

@ -0,0 +1,105 @@
package eu.dnetlib.dhp.collection.plugin.rest;
import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
/**
 * Manual tests that drive {@link RestCollectorPlugin} with a parameter set targeting the
 * OSF preprints endpoint ({@code https://api.osf.io/v2/preprints/}).
 * Both tests are {@code @Disabled}.
 */
public class OsfPreprintCollectorTest {

    private static final Logger log = LoggerFactory.getLogger(OsfPreprintCollectorTest.class);

    private final String baseUrl = "https://api.osf.io/v2/preprints/";

    // Query and record-extraction settings.
    private final String queryParams = "filter:is_published:d=true";
    private final String entityXpath = "/*/*[local-name()='data']";
    private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']";

    // Paging settings.
    private final String resumptionParam = "page";
    private final String resumptionType = "page";
    private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
    private final String resultSizeParam = "page[size]";
    private final String resultSizeValue = "100";

    // Response format settings.
    private final String resultFormatParam = "format";
    private final String resultFormatValue = "json";

    private final ApiDescriptor api = new ApiDescriptor();

    private RestCollectorPlugin rcp;

    /** Copies the settings above into the API descriptor and creates the plugin under test. */
    @BeforeEach
    public void setUp() {
        final HashMap<String, String> settings = new HashMap<>();

        settings.put("queryParams", this.queryParams);
        settings.put("entityXpath", this.entityXpath);
        settings.put("resultTotalXpath", this.resultTotalXpath);

        settings.put("resumptionType", this.resumptionType);
        settings.put("resumptionParam", this.resumptionParam);
        settings.put("resumptionXpath", this.resumptionXpath);
        settings.put("resultSizeParam", this.resultSizeParam);
        settings.put("resultSizeValue", this.resultSizeValue);

        settings.put("resultFormatParam", this.resultFormatParam);
        settings.put("resultFormatValue", this.resultFormatValue);

        this.api.setBaseUrl(this.baseUrl);
        this.api.setParams(settings);

        this.rcp = new RestCollectorPlugin(new HttpClientParams());
    }

    /** Reads at most 2000 records, requiring each to be non-empty and at least one to arrive. */
    @Test
    @Disabled
    void test_limited() throws CollectorException {
        final AtomicInteger collected = new AtomicInteger(0);

        this.rcp
            .collect(this.api, new AggregatorReport())
            .limit(2000)
            .forEach(record -> {
                Assertions.assertTrue(!record.isEmpty());
                collected.incrementAndGet();
                log.info(record);
            });

        log.info("{}", collected.get());
        Assertions.assertTrue(collected.get() > 0);
    }

    /** Reads every record, logging progress each 1000 records, and requires at least one to arrive. */
    @Test
    @Disabled
    void test_all() throws CollectorException {
        final AtomicLong collected = new AtomicLong(0);

        this.rcp
            .collect(this.api, new AggregatorReport())
            .forEach(record -> {
                Assertions.assertTrue(!record.isEmpty());
                final boolean atMilestone = (collected.incrementAndGet() % 1000) == 0;
                if (atMilestone) {
                    log.info("COLLECTED: {}", collected.get());
                }
            });

        log.info("TOTAL: {}", collected.get());
        Assertions.assertTrue(collected.get() > 0);
    }

}

View File

@ -1,10 +1,9 @@
{"DOI":"10.1021\/ac020069k","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/01f5ytq51","Status":"active","Confidence":1}]} {"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
{"DOI":"10.1161\/01.cir.0000013846.72805.7e","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02pttbw34","Status":"active","Confidence":1}]} {"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
{"DOI":"10.1161\/hy02t2.102992","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/00qqv6244","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/00p991c53","Status":"active","Confidence":1}]} {"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
{"DOI":"10.1126\/science.1073633","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03xez1567","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/006w34k90","Status":"active","Confidence":1}]} {"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
{"DOI":"10.1089\/10872910260066679","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/05cf8a891","Status":"active","Confidence":1}]} {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
{"DOI":"10.1108\/02656719610116117","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03mnm0t94","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/007tn5k56","Status":"active","Confidence":1}]} {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
{"DOI":"10.1080\/01443610050111986","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/001x4vz59","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/01tmqtf75","Status":"active","Confidence":1}]} {"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
{"DOI":"10.1021\/cm020118+","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02cf1je33","Confidence":1,"Status":"inactive"},{"PID":"ROR","Value":"https:\/\/ror.org\/01hvx5h04","Confidence":1,"Status":"active"}]} {"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
{"DOI":"10.1161\/hc1202.104524","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/040r8fr65","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/04fctr677","Status":"active","Confidence":1}]} {"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
{"DOI":"10.1021\/ma011134f","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/04tj63d06","Status":"active","Confidence":1}]}

View File

@ -1,9 +0,0 @@
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
{"DOI": "https://doi.org/10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}

Some files were not shown because too many files have changed in this diff Show More