forked from D-Net/dnet-hadoop
Compare commits
98 Commits
main
...
scholix_sm
Author | SHA1 | Date |
---|---|---|
Sandro La Bruzzo | 2a7e5de094 | |
Sandro La Bruzzo | 844a31f7a6 | |
Sandro La Bruzzo | 0c934d3c39 | |
Claudio Atzori | a02f3f0d2b | |
Alessia Bardi | eadfd8d71d | |
Alessia Bardi | 05ee783c07 | |
Alessia Bardi | fe9fb59c90 | |
Claudio Atzori | c272c4ad68 | |
Alessia Bardi | c5f4da16a4 | |
Alessia | 1b165a14a0 | |
Michele Artini | e996787be2 | |
Claudio Atzori | 62716141c5 | |
Miriam Baglioni | 5d85b70e1f | |
Giambattista Bloisi | 73316d8c83 | |
Miriam Baglioni | 75d5ddb999 | |
Miriam Baglioni | 87c9c61b41 | |
Miriam Baglioni | b55fed09f8 | |
Claudio Atzori | 107d958b89 | |
Claudio Atzori | 3a7a6ecc32 | |
Claudio Atzori | 1af4224d3d | |
Claudio Atzori | 0d5bdb2db0 | |
Claudio Atzori | 66548e6a83 | |
Giambattista Bloisi | 1b2357e10a | |
Sandro La Bruzzo | f1fe363b19 | |
Sandro La Bruzzo | 66c1ffc866 | |
Claudio Atzori | 1ea67eba82 | |
Claudio Atzori | f9fb2fef6e | |
Claudio Atzori | 834461ba26 | |
Sandro La Bruzzo | e8a61d5dd5 | |
Sandro La Bruzzo | ca9414b737 | |
Sandro La Bruzzo | 032bcc8279 | |
Sandro La Bruzzo | 103e2652b3 | |
Sandro La Bruzzo | a87f9ea643 | |
Sandro La Bruzzo | 6efab4d88e | |
Claudio Atzori | 92f018d196 | |
Claudio Atzori | 0611c81a2f | |
Claudio Atzori | 1efe7f7e39 | |
Claudio Atzori | 53e7bb4336 | |
Claudio Atzori | f7d56e2ef2 | |
Claudio Atzori | c1237ab39e | |
Claudio Atzori | dc3a5858f7 | |
Claudio Atzori | 55f39f7850 | |
Claudio Atzori | 39a2afe8b5 | |
Claudio Atzori | 908ed9da7a | |
Claudio Atzori | 18aa323ee9 | |
Claudio Atzori | b4e3389432 | |
Giambattista Bloisi | 711048ceed | |
Claudio Atzori | 26363060ed | |
Claudio Atzori | 0486227185 | |
Claudio Atzori | a5d13d5d27 | |
Claudio Atzori | e1a0fb8933 | |
Giambattista Bloisi | 69c5efbd8b | |
Sandro La Bruzzo | db358ad0d2 | |
Sandro La Bruzzo | 26bf8e763a | |
Sandro La Bruzzo | a860c57bbc | |
Sandro La Bruzzo | 0646d0d064 | |
Claudio Atzori | 00ad21d814 | |
Claudio Atzori | 4355f64810 | |
Claudio Atzori | 66680b8b9a | |
Claudio Atzori | dcf23b3d06 | |
Michele Artini | f4068de298 | |
Claudio Atzori | 11bd89e132 | |
Claudio Atzori | e96c2c1606 | |
Claudio Atzori | 50c18f7a0b | |
Michele Artini | 2615136efc | |
Sandro La Bruzzo | 133ead1e3e | |
Sandro La Bruzzo | 052c6aac9d | |
Sandro La Bruzzo | 9cd3bc0f10 | |
Claudio Atzori | c08a58bba8 | |
Claudio Atzori | e2937db385 | |
Giambattista Bloisi | 1878199dae | |
Sandro La Bruzzo | 0d628cd62b | |
Claudio Atzori | c3053ef34d | |
Claudio Atzori | b5bcab13ec | |
Claudio Atzori | 425c9afc36 | |
Claudio Atzori | 93dd9cc639 | |
Miriam Baglioni | 6189879643 | |
Claudio Atzori | c57cff2d6d | |
Miriam Baglioni | 7de114bda0 | |
Claudio Atzori | eb4692e4ee | |
Claudio Atzori | 24a83fc24f | |
Sandro La Bruzzo | 073f320c6a | |
Miriam Baglioni | 776c898c4b | |
Claudio Atzori | 5857fd38c1 | |
Claudio Atzori | 0656ab2838 | |
Claudio Atzori | ab7f0855af | |
Claudio Atzori | 7a7e313157 | |
Claudio Atzori | e5879b68c7 | |
Claudio Atzori | 3a027e97a7 | |
Sandro La Bruzzo | b72c3139e2 | |
Sandro La Bruzzo | b84ad0c06e | |
Sandro La Bruzzo | 8dd9cf84e2 | |
Sandro La Bruzzo | 342cb6189b | |
Giambattista Bloisi | 613ec5ffce | |
Sandro La Bruzzo | 52495f2cd2 | |
Sandro La Bruzzo | 8c3e9a09d3 | |
Giambattista Bloisi | 2fa78f6071 | |
Giambattista Bloisi | 326c9dc08c |
|
@ -27,3 +27,4 @@ spark-warehouse
|
||||||
/**/.factorypath
|
/**/.factorypath
|
||||||
/**/.scalafmt.conf
|
/**/.scalafmt.conf
|
||||||
/.java-version
|
/.java-version
|
||||||
|
/dhp-shade-package/dependency-reduced-pom.xml
|
||||||
|
|
|
@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
|
||||||
mojo.outputFile = testFolder;
|
mojo.outputFile = testFolder;
|
||||||
|
|
||||||
// execute
|
// execute
|
||||||
Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
|
try {
|
||||||
|
mojo.execute();
|
||||||
|
Assertions.assertTrue(false); // not reached
|
||||||
|
} catch (Exception e) {
|
||||||
|
Assertions
|
||||||
|
.assertTrue(
|
||||||
|
MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
|
||||||
|
IllegalArgumentException.class.isAssignableFrom(e.getClass()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -63,15 +63,14 @@
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>edu.cmu</groupId>
|
||||||
<artifactId>dhp-pace-core</artifactId>
|
<artifactId>secondstring</artifactId>
|
||||||
<version>${project.version}</version>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.ibm.icu</groupId>
|
||||||
|
<artifactId>icu4j</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.hadoop</groupId>
|
|
||||||
<artifactId>hadoop-common</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.sisyphsu</groupId>
|
<groupId>com.github.sisyphsu</groupId>
|
||||||
<artifactId>dateparser</artifactId>
|
<artifactId>dateparser</artifactId>
|
||||||
|
@ -161,7 +160,7 @@
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>${dhp-schemas.artifact}</artifactId>
|
<artifactId>dhp-schemas</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -170,4 +169,23 @@
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
<!-- dependencies required on JDK9+ because J2EE has been removed -->
|
||||||
|
<profiles>
|
||||||
|
<profile>
|
||||||
|
<id>spark-34</id>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>javax.xml.bind</groupId>
|
||||||
|
<artifactId>jaxb-api</artifactId>
|
||||||
|
<version>2.2.11</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.sun.xml.ws</groupId>
|
||||||
|
<artifactId>jaxws-ri</artifactId>
|
||||||
|
<version>2.3.3</version>
|
||||||
|
<type>pom</type>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</profile>
|
||||||
|
</profiles>
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -38,7 +38,7 @@ public class PacePerson {
|
||||||
PacePerson.class
|
PacePerson.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/common/name_particles.txt")));
|
"/eu/dnetlib/dhp/common/name_particles.txt")));
|
||||||
} catch (IOException e) {
|
} catch (Exception e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
|
||||||
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
|
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
|
||||||
* concept_rec_id = 656930
|
* concept_rec_id = 656930
|
||||||
* @return response code
|
* @return response code
|
||||||
* @throws IOException
|
|
||||||
* @throws MissingConceptDoiException
|
|
||||||
*/
|
*/
|
||||||
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
|
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
|
||||||
setDepositionId(concept_rec_id, 1);
|
setDepositionId(concept_rec_id, 1);
|
||||||
|
|
|
@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.math.NumberUtils;
|
import org.apache.commons.lang3.math.NumberUtils;
|
||||||
import org.apache.commons.lang3.time.DateUtils;
|
|
||||||
import org.apache.http.HttpHeaders;
|
import org.apache.http.HttpHeaders;
|
||||||
import org.joda.time.Instant;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,7 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
import com.wcohen.ss.JaroWinkler;
|
import com.wcohen.ss.JaroWinkler;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
@ -146,10 +147,20 @@ public class AuthorMerger {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String pidToComparableString(StructuredProperty pid) {
|
public static String pidToComparableString(StructuredProperty pid) {
|
||||||
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
|
final String classId = Optional
|
||||||
: "";
|
.ofNullable(pid)
|
||||||
return (pid.getQualifier() != null ? classid : "")
|
.map(
|
||||||
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
|
p -> Optional
|
||||||
|
.ofNullable(p.getQualifier())
|
||||||
|
.map(Qualifier::getClassid)
|
||||||
|
.map(String::toLowerCase)
|
||||||
|
.orElse(""))
|
||||||
|
.orElse("");
|
||||||
|
return Optional
|
||||||
|
.ofNullable(pid)
|
||||||
|
.map(StructuredProperty::getValue)
|
||||||
|
.map(v -> String.join("|", v, classId))
|
||||||
|
.orElse("");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static int countAuthorsPids(List<Author> authors) {
|
public static int countAuthorsPids(List<Author> authors) {
|
||||||
|
|
|
@ -135,7 +135,7 @@ public class GroupEntitiesSparkJob {
|
||||||
.applyCoarVocabularies(entity, vocs),
|
.applyCoarVocabularies(entity, vocs),
|
||||||
OAFENTITY_KRYO_ENC)
|
OAFENTITY_KRYO_ENC)
|
||||||
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
|
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
|
||||||
.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeGroup, OAFENTITY_KRYO_ENC)
|
.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
|
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
|
||||||
t.getClass().getName(), t),
|
t.getClass().getName(), t),
|
||||||
|
|
|
@ -30,8 +30,16 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
public class MergeUtils {
|
public class MergeUtils {
|
||||||
|
public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
|
||||||
|
return mergeGroup(s, oafEntityIterator, true);
|
||||||
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
|
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
|
||||||
|
return mergeGroup(s, oafEntityIterator, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
|
||||||
|
boolean checkDelegateAuthority) {
|
||||||
TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
|
TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
|
||||||
int res = 0;
|
int res = 0;
|
||||||
|
|
||||||
|
@ -52,18 +60,22 @@ public class MergeUtils {
|
||||||
sortedEntities.add(oafEntityIterator.next());
|
sortedEntities.add(oafEntityIterator.next());
|
||||||
}
|
}
|
||||||
|
|
||||||
T merged = sortedEntities.descendingIterator().next();
|
|
||||||
|
|
||||||
Iterator<T> it = sortedEntities.descendingIterator();
|
Iterator<T> it = sortedEntities.descendingIterator();
|
||||||
|
T merged = it.next();
|
||||||
|
|
||||||
while (it.hasNext()) {
|
while (it.hasNext()) {
|
||||||
merged = checkedMerge(merged, it.next());
|
merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
|
||||||
}
|
}
|
||||||
|
|
||||||
return merged;
|
return merged;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T checkedMerge(final T left, final T right) {
|
public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
|
||||||
return (T) merge(left, right, false);
|
return (T) merge(left, right, checkDelegateAuthority);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends Result, E extends Result> Result mergeResult(final T left, final E right) {
|
||||||
|
return (Result) merge(left, right, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Oaf merge(final Oaf left, final Oaf right) {
|
public static Oaf merge(final Oaf left, final Oaf right) {
|
||||||
|
@ -108,7 +120,7 @@ public class MergeUtils {
|
||||||
return mergeSoftware((Software) left, (Software) right);
|
return mergeSoftware((Software) left, (Software) right);
|
||||||
}
|
}
|
||||||
|
|
||||||
return mergeResult((Result) left, (Result) right);
|
return mergeResultFields((Result) left, (Result) right);
|
||||||
} else if (sameClass(left, right, Datasource.class)) {
|
} else if (sameClass(left, right, Datasource.class)) {
|
||||||
// TODO
|
// TODO
|
||||||
final int trust = compareTrust(left, right);
|
final int trust = compareTrust(left, right);
|
||||||
|
@ -151,9 +163,9 @@ public class MergeUtils {
|
||||||
}
|
}
|
||||||
// TODO: raise trust to have preferred fields from one or the other??
|
// TODO: raise trust to have preferred fields from one or the other??
|
||||||
if (new ResultTypeComparator().compare(left, right) < 0) {
|
if (new ResultTypeComparator().compare(left, right) < 0) {
|
||||||
return mergeResult(left, right);
|
return mergeResultFields(left, right);
|
||||||
} else {
|
} else {
|
||||||
return mergeResult(right, left);
|
return mergeResultFields(right, left);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -263,6 +275,12 @@ public class MergeUtils {
|
||||||
|
|
||||||
// TODO review
|
// TODO review
|
||||||
private static List<KeyValue> mergeByKey(List<KeyValue> left, List<KeyValue> right, int trust) {
|
private static List<KeyValue> mergeByKey(List<KeyValue> left, List<KeyValue> right, int trust) {
|
||||||
|
if (left == null) {
|
||||||
|
return right;
|
||||||
|
} else if (right == null) {
|
||||||
|
return left;
|
||||||
|
}
|
||||||
|
|
||||||
if (trust < 0) {
|
if (trust < 0) {
|
||||||
List<KeyValue> s = left;
|
List<KeyValue> s = left;
|
||||||
left = right;
|
left = right;
|
||||||
|
@ -270,8 +288,9 @@ public class MergeUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
HashMap<String, KeyValue> values = new HashMap<>();
|
HashMap<String, KeyValue> values = new HashMap<>();
|
||||||
left.forEach(kv -> values.put(kv.getKey(), kv));
|
|
||||||
right.forEach(kv -> values.putIfAbsent(kv.getKey(), kv));
|
Optional.ofNullable(left).ifPresent(l -> l.forEach(kv -> values.put(kv.getKey(), kv)));
|
||||||
|
Optional.ofNullable(right).ifPresent(r -> r.forEach(kv -> values.putIfAbsent(kv.getKey(), kv)));
|
||||||
|
|
||||||
return new ArrayList<>(values.values());
|
return new ArrayList<>(values.values());
|
||||||
}
|
}
|
||||||
|
@ -367,7 +386,7 @@ public class MergeUtils {
|
||||||
return merge;
|
return merge;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends Result> T mergeResult(T original, T enrich) {
|
private static <T extends Result> T mergeResultFields(T original, T enrich) {
|
||||||
final int trust = compareTrust(original, enrich);
|
final int trust = compareTrust(original, enrich);
|
||||||
T merge = mergeOafEntityFields(original, enrich, trust);
|
T merge = mergeOafEntityFields(original, enrich, trust);
|
||||||
|
|
||||||
|
@ -693,7 +712,7 @@ public class MergeUtils {
|
||||||
|
|
||||||
private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
|
private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
|
||||||
int trust = compareTrust(original, enrich);
|
int trust = compareTrust(original, enrich);
|
||||||
final T merge = mergeResult(original, enrich);
|
final T merge = mergeResultFields(original, enrich);
|
||||||
|
|
||||||
merge.setContactperson(unionDistinctLists(merge.getContactperson(), enrich.getContactperson(), trust));
|
merge.setContactperson(unionDistinctLists(merge.getContactperson(), enrich.getContactperson(), trust));
|
||||||
merge.setContactgroup(unionDistinctLists(merge.getContactgroup(), enrich.getContactgroup(), trust));
|
merge.setContactgroup(unionDistinctLists(merge.getContactgroup(), enrich.getContactgroup(), trust));
|
||||||
|
@ -704,7 +723,7 @@ public class MergeUtils {
|
||||||
|
|
||||||
private static <T extends Software> T mergeSoftware(T original, T enrich) {
|
private static <T extends Software> T mergeSoftware(T original, T enrich) {
|
||||||
int trust = compareTrust(original, enrich);
|
int trust = compareTrust(original, enrich);
|
||||||
final T merge = mergeResult(original, enrich);
|
final T merge = mergeResultFields(original, enrich);
|
||||||
|
|
||||||
merge.setDocumentationUrl(unionDistinctLists(merge.getDocumentationUrl(), enrich.getDocumentationUrl(), trust));
|
merge.setDocumentationUrl(unionDistinctLists(merge.getDocumentationUrl(), enrich.getDocumentationUrl(), trust));
|
||||||
merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust));
|
merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust));
|
||||||
|
@ -718,7 +737,7 @@ public class MergeUtils {
|
||||||
|
|
||||||
private static <T extends Dataset> T mergeDataset(T original, T enrich) {
|
private static <T extends Dataset> T mergeDataset(T original, T enrich) {
|
||||||
int trust = compareTrust(original, enrich);
|
int trust = compareTrust(original, enrich);
|
||||||
T merge = mergeResult(original, enrich);
|
T merge = mergeResultFields(original, enrich);
|
||||||
|
|
||||||
merge.setStoragedate(chooseReference(merge.getStoragedate(), enrich.getStoragedate(), trust));
|
merge.setStoragedate(chooseReference(merge.getStoragedate(), enrich.getStoragedate(), trust));
|
||||||
merge.setDevice(chooseReference(merge.getDevice(), enrich.getDevice(), trust));
|
merge.setDevice(chooseReference(merge.getDevice(), enrich.getDevice(), trust));
|
||||||
|
@ -737,7 +756,7 @@ public class MergeUtils {
|
||||||
|
|
||||||
public static <T extends Publication> T mergePublication(T original, T enrich) {
|
public static <T extends Publication> T mergePublication(T original, T enrich) {
|
||||||
final int trust = compareTrust(original, enrich);
|
final int trust = compareTrust(original, enrich);
|
||||||
T merged = mergeResult(original, enrich);
|
T merged = mergeResultFields(original, enrich);
|
||||||
|
|
||||||
merged.setJournal(chooseReference(merged.getJournal(), enrich.getJournal(), trust));
|
merged.setJournal(chooseReference(merged.getJournal(), enrich.getJournal(), trust));
|
||||||
|
|
||||||
|
@ -855,9 +874,11 @@ public class MergeUtils {
|
||||||
if (toEnrichInstances == null) {
|
if (toEnrichInstances == null) {
|
||||||
return enrichmentResult;
|
return enrichmentResult;
|
||||||
}
|
}
|
||||||
if (enrichmentInstances == null) {
|
|
||||||
return enrichmentResult;
|
if (enrichmentInstances == null || enrichmentInstances.isEmpty()) {
|
||||||
|
return toEnrichInstances;
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String, Instance> ri = toInstanceMap(enrichmentInstances);
|
Map<String, Instance> ri = toInstanceMap(enrichmentInstances);
|
||||||
|
|
||||||
toEnrichInstances.forEach(i -> {
|
toEnrichInstances.forEach(i -> {
|
||||||
|
|
|
@ -36,6 +36,15 @@ public class ResultTypeComparator implements Comparator<Result> {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (left.getResulttype() == null || left.getResulttype().getClassid() == null) {
|
||||||
|
if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
} else if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
String lClass = left.getResulttype().getClassid();
|
String lClass = left.getResulttype().getClassid();
|
||||||
String rClass = right.getResulttype().getClassid();
|
String rClass = right.getResulttype().getClassid();
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,101 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.common;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import com.google.common.base.Splitter;
|
||||||
|
import com.google.common.collect.Iterables;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set of common functions for the framework
|
||||||
|
*
|
||||||
|
* @author claudio
|
||||||
|
*/
|
||||||
|
public class PaceCommonUtils {
|
||||||
|
|
||||||
|
// transliterator
|
||||||
|
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
|
||||||
|
protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||||
|
protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||||
|
|
||||||
|
protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||||
|
|
||||||
|
protected static String fixAliases(final String s) {
|
||||||
|
final StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
|
s.chars().forEach(ch -> {
|
||||||
|
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||||
|
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
||||||
|
});
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static String transliterate(final String s) {
|
||||||
|
try {
|
||||||
|
return transliterator.transliterate(s);
|
||||||
|
} catch (Exception e) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String normalize(final String s) {
|
||||||
|
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||||
|
.toLowerCase()
|
||||||
|
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||||
|
// strings
|
||||||
|
.replaceAll("[^ \\w]+", "")
|
||||||
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||||
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
|
.replaceAll("(\\d)+", " ")
|
||||||
|
.replaceAll("(\\n)+", " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String nfd(final String s) {
|
||||||
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String unicodeNormalization(final String s) {
|
||||||
|
|
||||||
|
Matcher m = hexUnicodePattern.matcher(s);
|
||||||
|
StringBuffer buf = new StringBuffer(s.length());
|
||||||
|
while (m.find()) {
|
||||||
|
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||||
|
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||||
|
}
|
||||||
|
m.appendTail(buf);
|
||||||
|
return buf.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Set<String> loadFromClasspath(final String classpath) {
|
||||||
|
|
||||||
|
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
|
||||||
|
final Set<String> h = Sets.newHashSet();
|
||||||
|
try {
|
||||||
|
for (final String s : IOUtils
|
||||||
|
.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||||
|
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
||||||
|
}
|
||||||
|
} catch (final Throwable e) {
|
||||||
|
return Sets.newHashSet();
|
||||||
|
}
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
||||||
|
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -12,7 +12,7 @@ import com.google.common.collect.Iterables;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.hash.Hashing;
|
import com.google.common.hash.Hashing;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.PaceCommonUtils;
|
||||||
import eu.dnetlib.pace.util.Capitalise;
|
import eu.dnetlib.pace.util.Capitalise;
|
||||||
import eu.dnetlib.pace.util.DotAbbreviations;
|
import eu.dnetlib.pace.util.DotAbbreviations;
|
||||||
|
|
||||||
|
@ -86,7 +86,7 @@ public class Person {
|
||||||
|
|
||||||
private List<String> splitTerms(final String s) {
|
private List<String> splitTerms(final String s) {
|
||||||
if (particles == null) {
|
if (particles == null) {
|
||||||
particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<String> list = Lists.newArrayList();
|
final List<String> list = Lists.newArrayList();
|
|
@ -15,4 +15,4 @@ public class Capitalise implements Function<String, String> {
|
||||||
public String apply(final String s) {
|
public String apply(final String s) {
|
||||||
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||||
}
|
}
|
||||||
};
|
}
|
|
@ -8,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
|
||||||
public String apply(String s) {
|
public String apply(String s) {
|
||||||
return s.length() == 1 ? s + "." : s;
|
return s.length() == 1 ? s + "." : s;
|
||||||
}
|
}
|
||||||
};
|
}
|
|
@ -154,5 +154,13 @@
|
||||||
"unknown":{
|
"unknown":{
|
||||||
"original":"Unknown",
|
"original":"Unknown",
|
||||||
"inverse":"Unknown"
|
"inverse":"Unknown"
|
||||||
|
},
|
||||||
|
"isamongtopnsimilardocuments": {
|
||||||
|
"original": "IsAmongTopNSimilarDocuments",
|
||||||
|
"inverse": "HasAmongTopNSimilarDocuments"
|
||||||
|
},
|
||||||
|
"hasamongtopnsimilardocuments": {
|
||||||
|
"original": "HasAmongTopNSimilarDocuments",
|
||||||
|
"inverse": "IsAmongTopNSimilarDocuments"
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val master = parser.get("master")
|
val master = parser.get("master")
|
||||||
log.info(s"Creating Spark session: Master: $master")
|
log.info(s"Creating Spark session: Master: $master")
|
||||||
SparkSession
|
val b = SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.config(conf)
|
.config(conf)
|
||||||
.appName(getClass.getSimpleName)
|
.appName(getClass.getSimpleName)
|
||||||
.master(master)
|
if (master != null)
|
||||||
.getOrCreate()
|
b.master(master)
|
||||||
|
b.getOrCreate()
|
||||||
}
|
}
|
||||||
|
|
||||||
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
|
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
|
||||||
|
|
|
@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
def generateScholixResourceFromResult(r: Result): ScholixResource = {
|
def generateScholixResourceFromResult(r: Result): ScholixResource = {
|
||||||
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
|
val sum = ScholixUtils.resultToSummary(r)
|
||||||
|
if (sum != null)
|
||||||
|
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
|
||||||
|
else
|
||||||
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
|
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
|
||||||
|
@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def invRel(rel: String): String = {
|
||||||
|
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
|
||||||
|
if (semanticRelation != null)
|
||||||
|
semanticRelation.inverse
|
||||||
|
else
|
||||||
|
null
|
||||||
|
}
|
||||||
|
|
||||||
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
|
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
|
||||||
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
|
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
|
||||||
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
|
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
|
||||||
|
@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
|
||||||
if (persistentIdentifiers.isEmpty)
|
if (persistentIdentifiers.isEmpty)
|
||||||
return null
|
return null
|
||||||
s.setLocalIdentifier(persistentIdentifiers.asJava)
|
s.setLocalIdentifier(persistentIdentifiers.asJava)
|
||||||
if (r.isInstanceOf[Publication])
|
// s.setTypology(r.getResulttype.getClassid)
|
||||||
s.setTypology(Typology.publication)
|
|
||||||
else
|
|
||||||
s.setTypology(Typology.dataset)
|
|
||||||
|
|
||||||
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
|
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
|
||||||
|
|
||||||
|
|
|
@ -63,7 +63,7 @@ public class MergeUtilsTest {
|
||||||
assertEquals(1, d1.getCollectedfrom().size());
|
assertEquals(1, d1.getCollectedfrom().size());
|
||||||
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||||
|
|
||||||
final Result p1d2 = MergeUtils.checkedMerge(p1, d2);
|
final Result p1d2 = MergeUtils.checkedMerge(p1, d2, true);
|
||||||
assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid());
|
assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid());
|
||||||
assertTrue(p1d2 instanceof Publication);
|
assertTrue(p1d2 instanceof Publication);
|
||||||
assertEquals(p1.getId(), p1d2.getId());
|
assertEquals(p1.getId(), p1d2.getId());
|
||||||
|
@ -74,7 +74,7 @@ public class MergeUtilsTest {
|
||||||
Publication p2 = read("publication_2.json", Publication.class);
|
Publication p2 = read("publication_2.json", Publication.class);
|
||||||
Dataset d1 = read("dataset_1.json", Dataset.class);
|
Dataset d1 = read("dataset_1.json", Dataset.class);
|
||||||
|
|
||||||
final Result p2d1 = MergeUtils.checkedMerge(p2, d1);
|
final Result p2d1 = MergeUtils.checkedMerge(p2, d1, true);
|
||||||
assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid());
|
assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid());
|
||||||
assertTrue(p2d1 instanceof Dataset);
|
assertTrue(p2d1 instanceof Dataset);
|
||||||
assertEquals(d1.getId(), p2d1.getId());
|
assertEquals(d1.getId(), p2d1.getId());
|
||||||
|
@ -86,7 +86,7 @@ public class MergeUtilsTest {
|
||||||
Publication p1 = read("publication_1.json", Publication.class);
|
Publication p1 = read("publication_1.json", Publication.class);
|
||||||
Publication p2 = read("publication_2.json", Publication.class);
|
Publication p2 = read("publication_2.json", Publication.class);
|
||||||
|
|
||||||
Result p1p2 = MergeUtils.checkedMerge(p1, p2);
|
Result p1p2 = MergeUtils.checkedMerge(p1, p2, true);
|
||||||
assertTrue(p1p2 instanceof Publication);
|
assertTrue(p1p2 instanceof Publication);
|
||||||
assertEquals(p1.getId(), p1p2.getId());
|
assertEquals(p1.getId(), p1p2.getId());
|
||||||
assertEquals(2, p1p2.getCollectedfrom().size());
|
assertEquals(2, p1p2.getCollectedfrom().size());
|
||||||
|
|
|
@ -24,7 +24,7 @@
|
||||||
<executions>
|
<executions>
|
||||||
<execution>
|
<execution>
|
||||||
<id>scala-compile-first</id>
|
<id>scala-compile-first</id>
|
||||||
<phase>initialize</phase>
|
<phase>process-resources</phase>
|
||||||
<goals>
|
<goals>
|
||||||
<goal>add-source</goal>
|
<goal>add-source</goal>
|
||||||
<goal>compile</goal>
|
<goal>compile</goal>
|
||||||
|
@ -49,18 +49,16 @@
|
||||||
</build>
|
</build>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-common</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>edu.cmu</groupId>
|
<groupId>edu.cmu</groupId>
|
||||||
<artifactId>secondstring</artifactId>
|
<artifactId>secondstring</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>com.google.guava</groupId>
|
|
||||||
<artifactId>guava</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.google.code.gson</groupId>
|
|
||||||
<artifactId>gson</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
|
@ -85,10 +83,6 @@
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
<artifactId>jackson-databind</artifactId>
|
<artifactId>jackson-databind</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.commons</groupId>
|
|
||||||
<artifactId>commons-math3</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
<artifactId>json-path</artifactId>
|
<artifactId>json-path</artifactId>
|
||||||
|
@ -107,4 +101,90 @@
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
<profiles>
|
||||||
|
<profile>
|
||||||
|
<id>spark-24</id>
|
||||||
|
<activation>
|
||||||
|
<activeByDefault>true</activeByDefault>
|
||||||
|
</activation>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>build-helper-maven-plugin</artifactId>
|
||||||
|
<version>3.4.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<sources>
|
||||||
|
<source>src/main/spark-2</source>
|
||||||
|
</sources>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>spark-34</id>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>build-helper-maven-plugin</artifactId>
|
||||||
|
<version>3.4.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<sources>
|
||||||
|
<source>src/main/spark-2</source>
|
||||||
|
</sources>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>spark-35</id>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>build-helper-maven-plugin</artifactId>
|
||||||
|
<version>3.4.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<sources>
|
||||||
|
<source>src/main/spark-35</source>
|
||||||
|
</sources>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
|
</profiles>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -4,7 +4,6 @@ package eu.dnetlib.pace.common;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.Normalizer;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
@ -14,19 +13,15 @@ import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.google.common.base.Joiner;
|
import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import com.ibm.icu.text.Transliterator;
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set of common functions for the framework
|
* Set of common functions for the framework
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
public class AbstractPaceFunctions {
|
public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
|
|
||||||
// city map to be used when translating the city names into codes
|
// city map to be used when translating the city names into codes
|
||||||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||||
|
@ -41,9 +36,6 @@ public class AbstractPaceFunctions {
|
||||||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||||
|
|
||||||
// transliterator
|
|
||||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
|
||||||
|
|
||||||
// blacklist of ngrams: to avoid generic keys
|
// blacklist of ngrams: to avoid generic keys
|
||||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||||
|
|
||||||
|
@ -51,8 +43,6 @@ public class AbstractPaceFunctions {
|
||||||
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
|
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
|
||||||
|
|
||||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
|
||||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
|
||||||
|
|
||||||
// doi prefix for normalization
|
// doi prefix for normalization
|
||||||
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
|
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
|
||||||
|
@ -129,25 +119,6 @@ public class AbstractPaceFunctions {
|
||||||
return numberPattern.matcher(strNum).matches();
|
return numberPattern.matcher(strNum).matches();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static String fixAliases(final String s) {
|
|
||||||
final StringBuilder sb = new StringBuilder();
|
|
||||||
|
|
||||||
s.chars().forEach(ch -> {
|
|
||||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
|
||||||
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
|
||||||
});
|
|
||||||
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static String transliterate(final String s) {
|
|
||||||
try {
|
|
||||||
return transliterator.transliterate(s);
|
|
||||||
} catch (Exception e) {
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static String removeSymbols(final String s) {
|
protected static String removeSymbols(final String s) {
|
||||||
final StringBuilder sb = new StringBuilder();
|
final StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
|
@ -162,23 +133,6 @@ public class AbstractPaceFunctions {
|
||||||
return s != null;
|
return s != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String normalize(final String s) {
|
|
||||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
|
||||||
.toLowerCase()
|
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
|
||||||
// strings
|
|
||||||
.replaceAll("[^ \\w]+", "")
|
|
||||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
|
||||||
.replaceAll("(\\p{Punct})+", " ")
|
|
||||||
.replaceAll("(\\d)+", " ")
|
|
||||||
.replaceAll("(\\n)+", " ")
|
|
||||||
.trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String nfd(final String s) {
|
|
||||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String utf8(final String s) {
|
public static String utf8(final String s) {
|
||||||
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
||||||
return new String(bytes, StandardCharsets.UTF_8);
|
return new String(bytes, StandardCharsets.UTF_8);
|
||||||
|
@ -233,22 +187,6 @@ public class AbstractPaceFunctions {
|
||||||
return newset;
|
return newset;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Set<String> loadFromClasspath(final String classpath) {
|
|
||||||
|
|
||||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
|
||||||
|
|
||||||
final Set<String> h = Sets.newHashSet();
|
|
||||||
try {
|
|
||||||
for (final String s : IOUtils
|
|
||||||
.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
|
||||||
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
|
||||||
}
|
|
||||||
} catch (final Throwable e) {
|
|
||||||
return Sets.newHashSet();
|
|
||||||
}
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||||
|
|
||||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
@ -303,10 +241,6 @@ public class AbstractPaceFunctions {
|
||||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
|
||||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String normalizePid(String pid) {
|
public static String normalizePid(String pid) {
|
||||||
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
|
||||||
import com.jayway.jsonpath.{Configuration, JsonPath}
|
import com.jayway.jsonpath.{Configuration, JsonPath}
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions
|
import eu.dnetlib.pace.common.AbstractPaceFunctions
|
||||||
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil
|
import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
|
||||||
import org.apache.commons.lang3.StringUtils
|
import org.apache.commons.lang3.StringUtils
|
||||||
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
||||||
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
|
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
|
||||||
|
@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
|
||||||
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
|
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
|
||||||
|
|
||||||
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
|
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
|
||||||
df.map(r => rowFromJson(r))(RowEncoder(schema))
|
df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
|
||||||
}
|
}
|
||||||
|
|
||||||
def rowFromJson(json: String): Row = {
|
def rowFromJson(json: String): Row = {
|
||||||
|
|
|
@ -1,19 +1,20 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
|
||||||
import eu.dnetlib.pace.model.Person;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
|
||||||
import eu.dnetlib.pace.util.AuthorMatchers;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.function.BiFunction;
|
import java.util.function.BiFunction;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
import eu.dnetlib.pace.util.AuthorMatchers;
|
||||||
|
|
||||||
@ComparatorClass("authorsMatch")
|
@ComparatorClass("authorsMatch")
|
||||||
public class AuthorsMatch extends AbstractListComparator {
|
public class AuthorsMatch extends AbstractListComparator {
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,12 @@
|
||||||
|
package eu.dnetlib.pace.util
|
||||||
|
|
||||||
|
import org.apache.spark.sql.Row
|
||||||
|
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
|
||||||
|
import org.apache.spark.sql.types.StructType
|
||||||
|
|
||||||
|
object SparkCompatUtils {
|
||||||
|
|
||||||
|
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
|
||||||
|
RowEncoder(schema)
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
package eu.dnetlib.pace.util
|
||||||
|
|
||||||
|
import org.apache.spark.sql.Row
|
||||||
|
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
|
||||||
|
import org.apache.spark.sql.types.StructType
|
||||||
|
|
||||||
|
object SparkCompatUtils {
|
||||||
|
|
||||||
|
def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
|
||||||
|
ExpressionEncoder(schema)
|
||||||
|
}
|
||||||
|
}
|
|
@ -7,6 +7,7 @@ import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
@ -22,7 +23,7 @@ public class UtilTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
@Disabled
|
||||||
public void paceResolverTest() {
|
public void paceResolverTest() {
|
||||||
PaceResolver paceResolver = new PaceResolver();
|
PaceResolver paceResolver = new PaceResolver();
|
||||||
paceResolver.getComparator("keywordMatch", params);
|
paceResolver.getComparator("keywordMatch", params);
|
||||||
|
|
|
@ -0,0 +1,169 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<parent>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp</artifactId>
|
||||||
|
<version>1.2.5-SNAPSHOT</version>
|
||||||
|
<relativePath>../pom.xml</relativePath>
|
||||||
|
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<artifactId>dhp-shade-package</artifactId>
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
|
<distributionManagement>
|
||||||
|
<site>
|
||||||
|
<id>DHPSite</id>
|
||||||
|
<url>${dhp.site.stage.path}/dhp-common</url>
|
||||||
|
</site>
|
||||||
|
</distributionManagement>
|
||||||
|
|
||||||
|
<description>This module creates a jar of all module dependencies</description>
|
||||||
|
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-actionmanager</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<!-- <dependency>-->
|
||||||
|
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||||
|
<!-- <artifactId>dhp-aggregation</artifactId>-->
|
||||||
|
<!-- <version>${project.version}</version>-->
|
||||||
|
<!-- </dependency>-->
|
||||||
|
<!-- <dependency>-->
|
||||||
|
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||||
|
<!-- <artifactId>dhp-blacklist</artifactId>-->
|
||||||
|
<!-- <version>${project.version}</version>-->
|
||||||
|
<!-- </dependency>-->
|
||||||
|
<!-- <dependency>-->
|
||||||
|
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||||
|
<!-- <artifactId>dhp-broker-events</artifactId>-->
|
||||||
|
<!-- <version>${project.version}</version>-->
|
||||||
|
<!-- </dependency>-->
|
||||||
|
<!-- <dependency>-->
|
||||||
|
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||||
|
<!-- <artifactId>dhp-dedup-openaire</artifactId>-->
|
||||||
|
<!-- <version>${project.version}</version>-->
|
||||||
|
<!-- </dependency>-->
|
||||||
|
<!-- <dependency>-->
|
||||||
|
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||||
|
<!-- <artifactId>dhp-enrichment</artifactId>-->
|
||||||
|
<!-- <version>${project.version}</version>-->
|
||||||
|
<!-- </dependency>-->
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-graph-mapper</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-graph-provision</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-impact-indicators</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-stats-actionsets</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-stats-hist-snaps</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-stats-monitor-irish</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-stats-promote</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-stats-update</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-swh</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-usage-raw-data-update</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-usage-stats-build</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-shade-plugin</artifactId>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>package</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>shade</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<transformers>
|
||||||
|
<transformer
|
||||||
|
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
|
||||||
|
<mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
|
||||||
|
</transformer>
|
||||||
|
<!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
|
||||||
|
<transformer
|
||||||
|
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
|
||||||
|
<transformer
|
||||||
|
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
|
||||||
|
<resource>META-INF/cxf/bus-extensions.txt</resource>
|
||||||
|
</transformer>
|
||||||
|
</transformers>
|
||||||
|
<filters>
|
||||||
|
<filter>
|
||||||
|
<artifact>*:*</artifact>
|
||||||
|
<excludes>
|
||||||
|
<exclude>META-INF/maven/**</exclude>
|
||||||
|
<exclude>META-INF/*.SF</exclude>
|
||||||
|
<exclude>META-INF/*.DSA</exclude>
|
||||||
|
<exclude>META-INF/*.RSA</exclude>
|
||||||
|
</excludes>
|
||||||
|
</filter>
|
||||||
|
</filters>
|
||||||
|
<relocations>
|
||||||
|
<relocation>
|
||||||
|
<pattern>com</pattern>
|
||||||
|
<shadedPattern>repackaged.com.google.common</shadedPattern>
|
||||||
|
<includes>
|
||||||
|
<include>com.google.common.**</include>
|
||||||
|
</includes>
|
||||||
|
</relocation>
|
||||||
|
</relocations>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
</project>
|
|
@ -9,6 +9,7 @@ import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.BZip2Codec;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
.union(openAPCRelations)
|
.union(openAPCRelations)
|
||||||
.union(dataciteRelations)
|
.union(dataciteRelations)
|
||||||
.saveAsHadoopFile(
|
.saveAsHadoopFile(
|
||||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||||
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,6 +10,7 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.BZip2Codec;
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
resultsRDD
|
resultsRDD
|
||||||
.union(projectsRDD)
|
.union(projectsRDD)
|
||||||
.saveAsHadoopFile(
|
.saveAsHadoopFile(
|
||||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -80,9 +80,11 @@ public class PrepareFOSSparkJob implements Serializable {
|
||||||
|
|
||||||
fosDataset
|
fosDataset
|
||||||
.groupByKey((MapFunction<FOSDataModel, String>) v -> v.getOaid().toLowerCase(), Encoders.STRING())
|
.groupByKey((MapFunction<FOSDataModel, String>) v -> v.getOaid().toLowerCase(), Encoders.STRING())
|
||||||
.mapGroups((MapGroupsFunction<String, FOSDataModel, Result>) (k, it) -> {
|
.mapGroups(
|
||||||
return getResult(ModelSupport.getIdPrefix(Result.class) + "|" + k, it);
|
(MapGroupsFunction<String, FOSDataModel, Result>) (k,
|
||||||
}, Encoders.bean(Result.class))
|
it) -> getResult(
|
||||||
|
ModelSupport.entityIdPrefix.get(Result.class.getSimpleName().toLowerCase()) + "|" + k, it),
|
||||||
|
Encoders.bean(Result.class))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
|
|
|
@ -93,7 +93,7 @@ public class CreateActionSetSparkJob implements Serializable {
|
||||||
.filter((FilterFunction<Relation>) Objects::nonNull)
|
.filter((FilterFunction<Relation>) Objects::nonNull)
|
||||||
.toJavaRDD()
|
.toJavaRDD()
|
||||||
.map(p -> new AtomicAction(p.getClass(), p));
|
.map(p -> new AtomicAction(p.getClass(), p));
|
||||||
//TODO relations in stand-by waiting to know if we need to create them or not In case we need just make a union before saving the sequence file
|
|
||||||
spark
|
spark
|
||||||
.read()
|
.read()
|
||||||
.textFile(inputPath)
|
.textFile(inputPath)
|
||||||
|
@ -108,6 +108,7 @@ public class CreateActionSetSparkJob implements Serializable {
|
||||||
.filter((FilterFunction<Result>) r -> r != null)
|
.filter((FilterFunction<Result>) r -> r != null)
|
||||||
.toJavaRDD()
|
.toJavaRDD()
|
||||||
.map(p -> new AtomicAction(p.getClass(), p))
|
.map(p -> new AtomicAction(p.getClass(), p))
|
||||||
|
.union(relations)
|
||||||
.mapToPair(
|
.mapToPair(
|
||||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||||
|
|
|
@ -0,0 +1,251 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.GzipCodec;
|
||||||
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
|
import org.apache.spark.sql.*;
|
||||||
|
import org.apache.spark.sql.types.StructType;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author miriam.baglioni
|
||||||
|
* @Date 18/04/24
|
||||||
|
*/
|
||||||
|
public class CreateActionSetFromWebEntries implements Serializable {
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(CreateActionSetFromWebEntries.class);
|
||||||
|
private static final String DOI_PREFIX = "50|doi_________::";
|
||||||
|
|
||||||
|
private static final String ROR_PREFIX = "20|ror_________::";
|
||||||
|
|
||||||
|
private static final String PMID_PREFIX = "50|pmid________::";
|
||||||
|
|
||||||
|
private static final String PMCID_PREFIX = "50|pmc_________::";
|
||||||
|
private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
|
||||||
|
private static final String WEB_CRAWL_NAME = "Web Crawl";
|
||||||
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
String jsonConfiguration = IOUtils
|
||||||
|
.toString(
|
||||||
|
CreateActionSetFromWebEntries.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/actionmanager/webcrawl/as_parameters.json"));
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
|
||||||
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
|
final String inputPath = parser.get("sourcePath");
|
||||||
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
|
final String outputPath = parser.get("outputPath");
|
||||||
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
|
final String blackListInputPath = parser.get("blackListPath");
|
||||||
|
log.info("blackListInputPath: {}", blackListInputPath);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
|
||||||
|
createActionSet(spark, inputPath, outputPath, blackListInputPath);
|
||||||
|
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void createActionSet(SparkSession spark, String inputPath,
|
||||||
|
String outputPath, String blackListInputPath) {
|
||||||
|
|
||||||
|
final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
|
||||||
|
.filter("country_code=='IE'")
|
||||||
|
.drop("publication_year");
|
||||||
|
|
||||||
|
final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);
|
||||||
|
|
||||||
|
dataset
|
||||||
|
.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
|
||||||
|
.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
|
||||||
|
.drop("OpenAlexId")
|
||||||
|
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
||||||
|
List<Relation> ret = new ArrayList<>();
|
||||||
|
final String ror = ROR_PREFIX
|
||||||
|
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
|
||||||
|
ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
|
||||||
|
ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
|
||||||
|
ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
|
||||||
|
|
||||||
|
return ret
|
||||||
|
.iterator();
|
||||||
|
}, Encoders.bean(Relation.class))
|
||||||
|
.toJavaRDD()
|
||||||
|
.map(p -> new AtomicAction(p.getClass(), p))
|
||||||
|
.mapToPair(
|
||||||
|
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||||
|
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||||
|
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Dataset<Row> readWebCrawl(SparkSession spark, String inputPath) {
|
||||||
|
StructType webInfo = StructType
|
||||||
|
.fromDDL(
|
||||||
|
"`id` STRING , `doi` STRING, `ids` STRUCT<`pmid` :STRING, `pmcid`: STRING >, `publication_year` STRING, "
|
||||||
|
+
|
||||||
|
"`authorships` ARRAY<STRUCT <`institutions`: ARRAY <STRUCT <`ror`: STRING, `country_code` :STRING>>>>");
|
||||||
|
|
||||||
|
return spark
|
||||||
|
.read()
|
||||||
|
.schema(webInfo)
|
||||||
|
.json(inputPath)
|
||||||
|
.withColumn(
|
||||||
|
"authors", functions
|
||||||
|
.explode(
|
||||||
|
functions.col("authorships")))
|
||||||
|
.selectExpr("id", "doi", "ids", "publication_year", "authors.institutions as institutions")
|
||||||
|
.withColumn(
|
||||||
|
"institution", functions
|
||||||
|
.explode(
|
||||||
|
functions.col("institutions")))
|
||||||
|
.selectExpr(
|
||||||
|
"id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
|
||||||
|
"institution.country_code as country_code", "publication_year")
|
||||||
|
.distinct();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
|
||||||
|
|
||||||
|
return spark
|
||||||
|
.read()
|
||||||
|
.option("header", true)
|
||||||
|
.csv(inputPath)
|
||||||
|
.select("OpenAlexId");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {
|
||||||
|
if (pmcid == null)
|
||||||
|
return new ArrayList<>();
|
||||||
|
|
||||||
|
return createAffiliatioRelationPair(
|
||||||
|
PMCID_PREFIX
|
||||||
|
+ IdentifierFactory
|
||||||
|
.md5(PidCleaner.normalizePidValue(PidType.pmc.toString(), removeResolver("PMC", pmcid))),
|
||||||
|
ror);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<Relation> createAffiliationRelationPairPMID(String pmid, String ror) {
|
||||||
|
if (pmid == null)
|
||||||
|
return new ArrayList<>();
|
||||||
|
|
||||||
|
return createAffiliatioRelationPair(
|
||||||
|
PMID_PREFIX
|
||||||
|
+ IdentifierFactory
|
||||||
|
.md5(PidCleaner.normalizePidValue(PidType.pmid.toString(), removeResolver("PMID", pmid))),
|
||||||
|
ror);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String removeResolver(String pidType, String pid) {
|
||||||
|
switch (pidType) {
|
||||||
|
case "PMID":
|
||||||
|
return pid.substring(33);
|
||||||
|
case "PMC":
|
||||||
|
return "PMC" + pid.substring(43);
|
||||||
|
case "DOI":
|
||||||
|
return pid.substring(16);
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new RuntimeException();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<Relation> createAffiliationRelationPairDOI(String doi, String ror) {
|
||||||
|
if (doi == null)
|
||||||
|
return new ArrayList<>();
|
||||||
|
|
||||||
|
return createAffiliatioRelationPair(
|
||||||
|
DOI_PREFIX
|
||||||
|
+ IdentifierFactory
|
||||||
|
.md5(PidCleaner.normalizePidValue(PidType.doi.toString(), removeResolver("DOI", doi))),
|
||||||
|
ror);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<Relation> createAffiliatioRelationPair(String resultId, String orgId) {
|
||||||
|
ArrayList<Relation> newRelations = new ArrayList();
|
||||||
|
|
||||||
|
newRelations
|
||||||
|
.add(
|
||||||
|
OafMapperUtils
|
||||||
|
.getRelation(
|
||||||
|
orgId, resultId, ModelConstants.RESULT_ORGANIZATION, ModelConstants.AFFILIATION,
|
||||||
|
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
|
||||||
|
Arrays
|
||||||
|
.asList(
|
||||||
|
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
|
||||||
|
OafMapperUtils
|
||||||
|
.dataInfo(
|
||||||
|
false, null, false, false,
|
||||||
|
OafMapperUtils
|
||||||
|
.qualifier(
|
||||||
|
"sysimport:crasswalk:webcrawl", "Imported from Webcrawl",
|
||||||
|
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||||
|
"0.9"),
|
||||||
|
null));
|
||||||
|
|
||||||
|
newRelations
|
||||||
|
.add(
|
||||||
|
OafMapperUtils
|
||||||
|
.getRelation(
|
||||||
|
resultId, orgId, ModelConstants.RESULT_ORGANIZATION, ModelConstants.AFFILIATION,
|
||||||
|
ModelConstants.HAS_AUTHOR_INSTITUTION,
|
||||||
|
Arrays
|
||||||
|
.asList(
|
||||||
|
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
|
||||||
|
OafMapperUtils
|
||||||
|
.dataInfo(
|
||||||
|
false, null, false, false,
|
||||||
|
OafMapperUtils
|
||||||
|
.qualifier(
|
||||||
|
"sysimport:crasswalk:webcrawl", "Imported from Webcrawl",
|
||||||
|
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||||
|
"0.9"),
|
||||||
|
null));
|
||||||
|
|
||||||
|
return newRelations;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,6 +1,7 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.rest;
|
package eu.dnetlib.dhp.collection.plugin.rest;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Spliterator;
|
import java.util.Spliterator;
|
||||||
import java.util.Spliterators;
|
import java.util.Spliterators;
|
||||||
|
@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||||
|
@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
|
||||||
final String entityXpath = api.getParams().get("entityXpath");
|
final String entityXpath = api.getParams().get("entityXpath");
|
||||||
final String authMethod = api.getParams().get("authMethod");
|
final String authMethod = api.getParams().get("authMethod");
|
||||||
final String authToken = api.getParams().get("authToken");
|
final String authToken = api.getParams().get("authToken");
|
||||||
|
final String requestHeaderMap = api.getParams().get("requestHeaderMap");
|
||||||
|
Gson gson = new Gson();
|
||||||
|
Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
|
||||||
final String resultSizeValue = Optional
|
final String resultSizeValue = Optional
|
||||||
.ofNullable(api.getParams().get("resultSizeValue"))
|
.ofNullable(api.getParams().get("resultSizeValue"))
|
||||||
.filter(StringUtils::isNotBlank)
|
.filter(StringUtils::isNotBlank)
|
||||||
|
@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
|
||||||
if (StringUtils.isBlank(resultFormatValue)) {
|
if (StringUtils.isBlank(resultFormatValue)) {
|
||||||
throw new CollectorException("Param 'resultFormatValue' is null or empty");
|
throw new CollectorException("Param 'resultFormatValue' is null or empty");
|
||||||
}
|
}
|
||||||
if (StringUtils.isBlank(queryParams)) {
|
|
||||||
throw new CollectorException("Param 'queryParams' is null or empty");
|
|
||||||
}
|
|
||||||
if (StringUtils.isBlank(entityXpath)) {
|
if (StringUtils.isBlank(entityXpath)) {
|
||||||
throw new CollectorException("Param 'entityXpath' is null or empty");
|
throw new CollectorException("Param 'entityXpath' is null or empty");
|
||||||
}
|
}
|
||||||
|
@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
|
||||||
entityXpath,
|
entityXpath,
|
||||||
authMethod,
|
authMethod,
|
||||||
authToken,
|
authToken,
|
||||||
resultOutputFormat);
|
resultOutputFormat,
|
||||||
|
requestHeaders);
|
||||||
|
|
||||||
return StreamSupport
|
return StreamSupport
|
||||||
.stream(
|
.stream(
|
||||||
|
|
|
@ -9,6 +9,7 @@ import java.net.URL;
|
||||||
import java.net.URLEncoder;
|
import java.net.URLEncoder;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Queue;
|
import java.util.Queue;
|
||||||
import java.util.concurrent.PriorityBlockingQueue;
|
import java.util.concurrent.PriorityBlockingQueue;
|
||||||
|
|
||||||
|
@ -18,7 +19,11 @@ import javax.xml.transform.TransformerConfigurationException;
|
||||||
import javax.xml.transform.TransformerFactory;
|
import javax.xml.transform.TransformerFactory;
|
||||||
import javax.xml.transform.dom.DOMSource;
|
import javax.xml.transform.dom.DOMSource;
|
||||||
import javax.xml.transform.stream.StreamResult;
|
import javax.xml.transform.stream.StreamResult;
|
||||||
import javax.xml.xpath.*;
|
import javax.xml.xpath.XPath;
|
||||||
|
import javax.xml.xpath.XPathConstants;
|
||||||
|
import javax.xml.xpath.XPathExpression;
|
||||||
|
import javax.xml.xpath.XPathExpressionException;
|
||||||
|
import javax.xml.xpath.XPathFactory;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
@ -30,12 +35,14 @@ import org.w3c.dom.Node;
|
||||||
import org.w3c.dom.NodeList;
|
import org.w3c.dom.NodeList;
|
||||||
import org.xml.sax.InputSource;
|
import org.xml.sax.InputSource;
|
||||||
|
|
||||||
|
import com.google.common.collect.Maps;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
|
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
|
||||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* log.info(...) equal to log.trace(...) in the application-logs
|
* log.info(...) equal to log.trace(...) in the application-logs
|
||||||
* <p>
|
* <p>
|
||||||
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
|
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
|
||||||
*
|
*
|
||||||
|
@ -47,10 +54,11 @@ public class RestIterator implements Iterator<String> {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
|
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
|
||||||
public static final String UTF_8 = "UTF-8";
|
public static final String UTF_8 = "UTF-8";
|
||||||
|
private static final int MAX_ATTEMPTS = 5;
|
||||||
|
|
||||||
private final HttpClientParams clientParams;
|
private final HttpClientParams clientParams;
|
||||||
|
|
||||||
private final String BASIC = "basic";
|
private final String AUTHBASIC = "basic";
|
||||||
|
|
||||||
private final String baseUrl;
|
private final String baseUrl;
|
||||||
private final String resumptionType;
|
private final String resumptionType;
|
||||||
|
@ -60,8 +68,9 @@ public class RestIterator implements Iterator<String> {
|
||||||
private final int resultSizeValue;
|
private final int resultSizeValue;
|
||||||
private int resumptionInt = 0; // integer resumption token (first record to harvest)
|
private int resumptionInt = 0; // integer resumption token (first record to harvest)
|
||||||
private int resultTotal = -1;
|
private int resultTotal = -1;
|
||||||
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
|
private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
|
||||||
// or token scanned from results)
|
// harvest
|
||||||
|
// or token scanned from results)
|
||||||
private InputStream resultStream;
|
private InputStream resultStream;
|
||||||
private Transformer transformer;
|
private Transformer transformer;
|
||||||
private XPath xpath;
|
private XPath xpath;
|
||||||
|
@ -73,7 +82,7 @@ public class RestIterator implements Iterator<String> {
|
||||||
private final String querySize;
|
private final String querySize;
|
||||||
private final String authMethod;
|
private final String authMethod;
|
||||||
private final String authToken;
|
private final String authToken;
|
||||||
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
|
private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
|
||||||
private int discoverResultSize = 0;
|
private int discoverResultSize = 0;
|
||||||
private int pagination = 1;
|
private int pagination = 1;
|
||||||
/*
|
/*
|
||||||
|
@ -83,8 +92,13 @@ public class RestIterator implements Iterator<String> {
|
||||||
*/
|
*/
|
||||||
private final String resultOutputFormat;
|
private final String resultOutputFormat;
|
||||||
|
|
||||||
/** RestIterator class
|
/*
|
||||||
* compatible to version 1.3.33
|
* Can be used to set additional request headers, like for content negotiation
|
||||||
|
*/
|
||||||
|
private Map<String, String> requestHeaders;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* RestIterator class compatible to version 1.3.33
|
||||||
*/
|
*/
|
||||||
public RestIterator(
|
public RestIterator(
|
||||||
final HttpClientParams clientParams,
|
final HttpClientParams clientParams,
|
||||||
|
@ -101,47 +115,51 @@ public class RestIterator implements Iterator<String> {
|
||||||
final String entityXpath,
|
final String entityXpath,
|
||||||
final String authMethod,
|
final String authMethod,
|
||||||
final String authToken,
|
final String authToken,
|
||||||
final String resultOutputFormat) {
|
final String resultOutputFormat,
|
||||||
|
final Map<String, String> requestHeaders) {
|
||||||
|
|
||||||
this.clientParams = clientParams;
|
this.clientParams = clientParams;
|
||||||
this.baseUrl = baseUrl;
|
this.baseUrl = baseUrl;
|
||||||
this.resumptionType = resumptionType;
|
this.resumptionType = resumptionType;
|
||||||
this.resumptionParam = resumptionParam;
|
this.resumptionParam = resumptionParam;
|
||||||
this.resultFormatValue = resultFormatValue;
|
this.resultFormatValue = resultFormatValue;
|
||||||
this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
|
this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
|
||||||
this.queryParams = queryParams;
|
this.queryParams = queryParams;
|
||||||
this.authMethod = authMethod;
|
this.authMethod = authMethod;
|
||||||
this.authToken = authToken;
|
this.authToken = authToken;
|
||||||
this.resultOutputFormat = resultOutputFormat;
|
this.resultOutputFormat = resultOutputFormat;
|
||||||
|
this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();
|
||||||
|
|
||||||
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
|
this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
|
||||||
|
: "";
|
||||||
|
this.querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr
|
||||||
: "";
|
: "";
|
||||||
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
|
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
|
||||||
} catch (Exception e) {
|
} catch (final Exception e) {
|
||||||
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
|
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
initQueue();
|
initQueue();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
|
private void initXmlTransformation(final String resultTotalXpath, final String resumptionXpath,
|
||||||
|
final String entityXpath)
|
||||||
throws TransformerConfigurationException, XPathExpressionException {
|
throws TransformerConfigurationException, XPathExpressionException {
|
||||||
final TransformerFactory factory = TransformerFactory.newInstance();
|
final TransformerFactory factory = TransformerFactory.newInstance();
|
||||||
transformer = factory.newTransformer();
|
this.transformer = factory.newTransformer();
|
||||||
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
||||||
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
|
this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
|
||||||
xpath = XPathFactory.newInstance().newXPath();
|
this.xpath = XPathFactory.newInstance().newXPath();
|
||||||
xprResultTotalPath = xpath.compile(resultTotalXpath);
|
this.xprResultTotalPath = this.xpath.compile(resultTotalXpath);
|
||||||
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
this.xprResumptionPath = this.xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
||||||
xprEntity = xpath.compile(entityXpath);
|
this.xprEntity = this.xpath.compile(entityXpath);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initQueue() {
|
private void initQueue() {
|
||||||
query = baseUrl + "?" + queryParams + querySize + queryFormat;
|
this.query = this.baseUrl + "?" + this.queryParams + this.querySize + this.queryFormat;
|
||||||
log.info("REST calls starting with {}", query);
|
log.info("REST calls starting with {}", this.query);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void disconnect() {
|
private void disconnect() {
|
||||||
|
@ -154,12 +172,11 @@ public class RestIterator implements Iterator<String> {
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
if (recordQueue.isEmpty() && query.isEmpty()) {
|
if (this.recordQueue.isEmpty() && this.query.isEmpty()) {
|
||||||
disconnect();
|
disconnect();
|
||||||
return false;
|
return false;
|
||||||
} else {
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -168,214 +185,237 @@ public class RestIterator implements Iterator<String> {
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public String next() {
|
public String next() {
|
||||||
synchronized (recordQueue) {
|
synchronized (this.recordQueue) {
|
||||||
while (recordQueue.isEmpty() && !query.isEmpty()) {
|
while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
|
||||||
try {
|
try {
|
||||||
query = downloadPage(query);
|
this.query = downloadPage(this.query, 0);
|
||||||
} catch (CollectorException e) {
|
} catch (final CollectorException e) {
|
||||||
log.debug("CollectorPlugin.next()-Exception: {}", e);
|
log.debug("CollectorPlugin.next()-Exception: {}", e);
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return recordQueue.poll();
|
return this.recordQueue.poll();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* download page and return nextQuery
|
* download page and return nextQuery (with number of attempt)
|
||||||
*/
|
*/
|
||||||
private String downloadPage(String query) throws CollectorException {
|
private String downloadPage(String query, final int attempt) throws CollectorException {
|
||||||
String resultJson;
|
|
||||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
|
||||||
String nextQuery = "";
|
|
||||||
String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
|
|
||||||
Node resultNode = null;
|
|
||||||
NodeList nodeList = null;
|
|
||||||
String qUrlArgument = "";
|
|
||||||
int urlOldResumptionSize = 0;
|
|
||||||
InputStream theHttpInputStream;
|
|
||||||
|
|
||||||
// check if cursor=* is initial set otherwise add it to the queryParam URL
|
if (attempt > MAX_ATTEMPTS) {
|
||||||
if (resumptionType.equalsIgnoreCase("deep-cursor")) {
|
throw new CollectorException("Max Number of attempts reached, query:" + query);
|
||||||
log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
|
}
|
||||||
if (!query.contains("&cursor=")) {
|
|
||||||
query += "&cursor=*";
|
if (attempt > 0) {
|
||||||
|
final int delay = (attempt * 5000);
|
||||||
|
log.debug("Attempt {} with delay {}", attempt, delay);
|
||||||
|
try {
|
||||||
|
Thread.sleep(delay);
|
||||||
|
} catch (final InterruptedException e) {
|
||||||
|
new CollectorException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
log.info("requestig URL [{}]", query);
|
String resultJson;
|
||||||
|
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||||
|
String nextQuery = "";
|
||||||
|
final String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
|
||||||
|
Node resultNode = null;
|
||||||
|
NodeList nodeList = null;
|
||||||
|
String qUrlArgument = "";
|
||||||
|
int urlOldResumptionSize = 0;
|
||||||
|
InputStream theHttpInputStream;
|
||||||
|
|
||||||
URL qUrl = new URL(query);
|
// check if cursor=* is initial set otherwise add it to the queryParam URL
|
||||||
log.debug("authMethod: {}", authMethod);
|
if ("deep-cursor".equalsIgnoreCase(this.resumptionType)) {
|
||||||
if ("bearer".equalsIgnoreCase(this.authMethod)) {
|
log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
|
||||||
log.trace("authMethod before inputStream: {}", resultXml);
|
if (!query.contains("&cursor=")) {
|
||||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
query += "&cursor=*";
|
||||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
|
|
||||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
|
|
||||||
conn.setRequestMethod("GET");
|
|
||||||
theHttpInputStream = conn.getInputStream();
|
|
||||||
} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
|
|
||||||
log.trace("authMethod before inputStream: {}", resultXml);
|
|
||||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
|
||||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
|
|
||||||
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
|
|
||||||
conn.setRequestMethod("GET");
|
|
||||||
theHttpInputStream = conn.getInputStream();
|
|
||||||
} else {
|
|
||||||
theHttpInputStream = qUrl.openStream();
|
|
||||||
}
|
|
||||||
|
|
||||||
resultStream = theHttpInputStream;
|
|
||||||
if ("json".equals(resultOutputFormat)) {
|
|
||||||
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
|
|
||||||
resultXml = JsonUtils.convertToXML(resultJson);
|
|
||||||
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
|
|
||||||
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
|
|
||||||
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
|
||||||
log.debug("nodeList.length: {}", nodeList.getLength());
|
|
||||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
|
||||||
StringWriter sw = new StringWriter();
|
|
||||||
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
|
||||||
String toEnqueue = sw.toString();
|
|
||||||
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
|
|
||||||
log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
|
|
||||||
} else {
|
|
||||||
recordQueue.add(sw.toString());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
log.warn("resultXml is equal with emptyXml");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
resumptionInt += resultSizeValue;
|
try {
|
||||||
|
log.info("requesting URL [{}]", query);
|
||||||
|
|
||||||
switch (resumptionType.toLowerCase()) {
|
final URL qUrl = new URL(query);
|
||||||
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
|
log.debug("authMethod: {}", this.authMethod);
|
||||||
resumptionStr = xprResumptionPath.evaluate(resultNode);
|
if (this.authMethod == "bearer") {
|
||||||
break;
|
log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
|
||||||
|
requestHeaders.put("Authorization", "Bearer " + authToken);
|
||||||
|
// requestHeaders.put("Content-Type", "application/json");
|
||||||
|
} else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
|
||||||
|
log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
|
||||||
|
requestHeaders.put("Authorization", "Basic " + authToken);
|
||||||
|
// requestHeaders.put("accept", "application/xml");
|
||||||
|
}
|
||||||
|
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||||
|
conn.setRequestMethod("GET");
|
||||||
|
this.setRequestHeader(conn);
|
||||||
|
resultStream = conn.getInputStream();
|
||||||
|
|
||||||
case "count": // begin at one step for all records, iterate over items
|
if ("json".equals(this.resultOutputFormat)) {
|
||||||
resumptionStr = Integer.toString(resumptionInt);
|
resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
|
||||||
break;
|
resultXml = JsonUtils.convertToXML(resultJson);
|
||||||
|
this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
||||||
|
}
|
||||||
|
|
||||||
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
|
||||||
if (resultSizeValue < 2) {
|
resultNode = (Node) this.xpath
|
||||||
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
|
.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
|
||||||
|
nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
||||||
|
log.debug("nodeList.length: {}", nodeList.getLength());
|
||||||
|
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||||
|
final StringWriter sw = new StringWriter();
|
||||||
|
this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
||||||
|
final String toEnqueue = sw.toString();
|
||||||
|
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue)
|
||||||
|
|| emptyXml.equalsIgnoreCase(toEnqueue)) {
|
||||||
|
log
|
||||||
|
.warn(
|
||||||
|
"The following record resulted in empty item for the feeding queue: {}", resultXml);
|
||||||
|
} else {
|
||||||
|
this.recordQueue.add(sw.toString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
qUrlArgument = qUrl.getQuery();
|
} else {
|
||||||
String[] arrayQUrlArgument = qUrlArgument.split("&");
|
log.warn("resultXml is equal with emptyXml");
|
||||||
for (String arrayUrlArgStr : arrayQUrlArgument) {
|
}
|
||||||
if (arrayUrlArgStr.startsWith(resumptionParam)) {
|
|
||||||
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
this.resumptionInt += this.resultSizeValue;
|
||||||
if (isInteger(resumptionKeyValue[1])) {
|
|
||||||
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
|
switch (this.resumptionType.toLowerCase()) {
|
||||||
log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
|
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
|
||||||
} else {
|
this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
|
||||||
log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
|
break;
|
||||||
|
|
||||||
|
case "count": // begin at one step for all records, iterate over items
|
||||||
|
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
||||||
|
if (this.resultSizeValue < 2) {
|
||||||
|
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
|
||||||
|
}
|
||||||
|
qUrlArgument = qUrl.getQuery();
|
||||||
|
final String[] arrayQUrlArgument = qUrlArgument.split("&");
|
||||||
|
for (final String arrayUrlArgStr : arrayQUrlArgument) {
|
||||||
|
if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
|
||||||
|
final String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
||||||
|
if (isInteger(resumptionKeyValue[1])) {
|
||||||
|
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
|
||||||
|
log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
|
||||||
|
} else {
|
||||||
|
log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (((emptyXml).equalsIgnoreCase(resultXml))
|
if (((emptyXml).equalsIgnoreCase(resultXml))
|
||||||
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
|
|| ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
|
||||||
// resumptionStr = "";
|
// resumptionStr = "";
|
||||||
if (nodeList != null) {
|
if (nodeList != null) {
|
||||||
discoverResultSize += nodeList.getLength();
|
this.discoverResultSize += nodeList.getLength();
|
||||||
|
}
|
||||||
|
this.resultTotal = this.discoverResultSize;
|
||||||
|
} else {
|
||||||
|
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||||
|
this.resultTotal = this.resumptionInt + 1;
|
||||||
|
if (nodeList != null) {
|
||||||
|
this.discoverResultSize += nodeList.getLength();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
resultTotal = discoverResultSize;
|
log.info("discoverResultSize: {}", this.discoverResultSize);
|
||||||
} else {
|
break;
|
||||||
resumptionStr = Integer.toString(resumptionInt);
|
|
||||||
resultTotal = resumptionInt + 1;
|
case "pagination":
|
||||||
|
case "page": // pagination, iterate over page numbers
|
||||||
|
this.pagination += 1;
|
||||||
if (nodeList != null) {
|
if (nodeList != null) {
|
||||||
discoverResultSize += nodeList.getLength();
|
this.discoverResultSize += nodeList.getLength();
|
||||||
|
} else {
|
||||||
|
this.resultTotal = this.discoverResultSize;
|
||||||
|
this.pagination = this.discoverResultSize;
|
||||||
}
|
}
|
||||||
}
|
this.resumptionInt = this.pagination;
|
||||||
log.info("discoverResultSize: {}", discoverResultSize);
|
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case "pagination":
|
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor
|
||||||
case "page": // pagination, iterate over page numbers
|
// in
|
||||||
pagination += 1;
|
// solr)
|
||||||
if (nodeList != null) {
|
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
|
||||||
discoverResultSize += nodeList.getLength();
|
// deep-cursor, Param 'resultSizeValue' is less than 2");}
|
||||||
} else {
|
|
||||||
resultTotal = discoverResultSize;
|
|
||||||
pagination = discoverResultSize;
|
|
||||||
}
|
|
||||||
resumptionInt = pagination;
|
|
||||||
resumptionStr = Integer.toString(resumptionInt);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
|
this.resumptionStr = encodeValue(this.xprResumptionPath.evaluate(resultNode));
|
||||||
// solr)
|
this.queryParams = this.queryParams.replace("&cursor=*", "");
|
||||||
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
|
|
||||||
// deep-cursor, Param 'resultSizeValue' is less than 2");}
|
|
||||||
|
|
||||||
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
|
// terminating if length of nodeList is 0
|
||||||
queryParams = queryParams.replace("&cursor=*", "");
|
if ((nodeList != null) && (nodeList.getLength() < this.discoverResultSize)) {
|
||||||
|
this.resumptionInt += ((nodeList.getLength() + 1) - this.resultSizeValue);
|
||||||
|
} else {
|
||||||
|
this.resumptionInt += (nodeList.getLength() - this.resultSizeValue); // subtract the
|
||||||
|
// resultSizeValue
|
||||||
|
// because the iteration is over
|
||||||
|
// real length and the
|
||||||
|
// resultSizeValue is added before
|
||||||
|
// the switch()
|
||||||
|
}
|
||||||
|
|
||||||
// terminating if length of nodeList is 0
|
this.discoverResultSize = nodeList.getLength();
|
||||||
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
|
|
||||||
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
|
|
||||||
} else {
|
|
||||||
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
|
|
||||||
// because the iteration is over
|
|
||||||
// real length and the
|
|
||||||
// resultSizeValue is added before
|
|
||||||
// the switch()
|
|
||||||
}
|
|
||||||
|
|
||||||
discoverResultSize = nodeList.getLength();
|
log
|
||||||
|
.debug(
|
||||||
|
"downloadPage().deep-cursor: resumptionStr=" + this.resumptionStr + " ; queryParams="
|
||||||
|
+ this.queryParams + " resumptionLengthIncreased: " + this.resumptionInt);
|
||||||
|
|
||||||
log
|
break;
|
||||||
.debug(
|
|
||||||
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
|
|
||||||
+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
|
|
||||||
|
|
||||||
break;
|
default: // otherwise: abort
|
||||||
|
// resultTotal = resumptionInt;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
default: // otherwise: abort
|
} catch (final Exception e) {
|
||||||
// resultTotal = resumptionInt;
|
log.error(e.getMessage(), e);
|
||||||
break;
|
throw new IllegalStateException("collection failed: " + e.getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (Exception e) {
|
try {
|
||||||
log.error(e.getMessage(), e);
|
if (this.resultTotal == -1) {
|
||||||
throw new IllegalStateException("collection failed: " + e.getMessage());
|
this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
|
||||||
}
|
if ("page".equalsIgnoreCase(this.resumptionType)
|
||||||
|
&& !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
|
||||||
try {
|
this.resultTotal += 1;
|
||||||
if (resultTotal == -1) {
|
} // to correct the upper bound
|
||||||
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
|
log.info("resultTotal was -1 is now: " + this.resultTotal);
|
||||||
if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
|
}
|
||||||
resultTotal += 1;
|
} catch (final Exception e) {
|
||||||
} // to correct the upper bound
|
log.error(e.getMessage(), e);
|
||||||
log.info("resultTotal was -1 is now: " + resultTotal);
|
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
log.debug("resultTotal: " + this.resultTotal);
|
||||||
log.error(e.getMessage(), e);
|
log.debug("resInt: " + this.resumptionInt);
|
||||||
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
|
if (this.resumptionInt <= this.resultTotal) {
|
||||||
|
nextQuery = this.baseUrl + "?" + this.queryParams + this.querySize + "&" + this.resumptionParam + "="
|
||||||
|
+ this.resumptionStr
|
||||||
|
+ this.queryFormat;
|
||||||
|
} else {
|
||||||
|
nextQuery = "";
|
||||||
|
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
|
||||||
|
// resumptionInt and prevent a NullPointer Exception at mdStore
|
||||||
|
}
|
||||||
|
log.debug("nextQueryUrl: " + nextQuery);
|
||||||
|
return nextQuery;
|
||||||
|
} catch (final Throwable e) {
|
||||||
|
log.warn(e.getMessage(), e);
|
||||||
|
return downloadPage(query, attempt + 1);
|
||||||
}
|
}
|
||||||
log.debug("resultTotal: " + resultTotal);
|
|
||||||
log.debug("resInt: " + resumptionInt);
|
|
||||||
if (resumptionInt <= resultTotal) {
|
|
||||||
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
|
|
||||||
+ queryFormat;
|
|
||||||
} else {
|
|
||||||
nextQuery = "";
|
|
||||||
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
|
|
||||||
// resumptionInt and prevent a NullPointer Exception at mdStore
|
|
||||||
}
|
|
||||||
log.debug("nextQueryUrl: " + nextQuery);
|
|
||||||
return nextQuery;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isInteger(String s) {
|
private boolean isInteger(final String s) {
|
||||||
boolean isValidInteger = false;
|
boolean isValidInteger = false;
|
||||||
try {
|
try {
|
||||||
Integer.parseInt(s);
|
Integer.parseInt(s);
|
||||||
|
@ -383,7 +423,7 @@ public class RestIterator implements Iterator<String> {
|
||||||
// s is a valid integer
|
// s is a valid integer
|
||||||
|
|
||||||
isValidInteger = true;
|
isValidInteger = true;
|
||||||
} catch (NumberFormatException ex) {
|
} catch (final NumberFormatException ex) {
|
||||||
// s is not an integer
|
// s is not an integer
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -391,20 +431,36 @@ public class RestIterator implements Iterator<String> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Method to encode a string value using `UTF-8` encoding scheme
|
// Method to encode a string value using `UTF-8` encoding scheme
|
||||||
private String encodeValue(String value) {
|
private String encodeValue(final String value) {
|
||||||
try {
|
try {
|
||||||
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
|
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
|
||||||
} catch (UnsupportedEncodingException ex) {
|
} catch (final UnsupportedEncodingException ex) {
|
||||||
throw new RuntimeException(ex.getCause());
|
throw new RuntimeException(ex.getCause());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* setRequestHeader
|
||||||
|
*
|
||||||
|
* setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value.
|
||||||
|
* @param conn
|
||||||
|
*/
|
||||||
|
private void setRequestHeader(HttpURLConnection conn) {
|
||||||
|
if (requestHeaders != null) {
|
||||||
|
for (String key : requestHeaders.keySet()) {
|
||||||
|
conn.setRequestProperty(key, requestHeaders.get(key));
|
||||||
|
}
|
||||||
|
log.debug("Set Request Header with: " + requestHeaders);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public String getResultFormatValue() {
|
public String getResultFormatValue() {
|
||||||
return resultFormatValue;
|
return this.resultFormatValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getResultOutputFormat() {
|
public String getResultOutputFormat() {
|
||||||
return resultOutputFormat;
|
return this.resultOutputFormat;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,10 @@ import java.io.StringWriter;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.nio.charset.CharsetDecoder;
|
import java.nio.charset.CharsetDecoder;
|
||||||
import java.nio.charset.CodingErrorAction;
|
import java.nio.charset.CodingErrorAction;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import javax.xml.stream.XMLEventFactory;
|
import javax.xml.stream.XMLEventFactory;
|
||||||
import javax.xml.stream.XMLEventReader;
|
import javax.xml.stream.XMLEventReader;
|
||||||
|
@ -19,6 +22,7 @@ import javax.xml.stream.XMLStreamException;
|
||||||
import javax.xml.stream.events.StartElement;
|
import javax.xml.stream.events.StartElement;
|
||||||
import javax.xml.stream.events.XMLEvent;
|
import javax.xml.stream.events.XMLEvent;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
@ -58,13 +62,23 @@ public class XMLIterator implements Iterator<String> {
|
||||||
|
|
||||||
private String element;
|
private String element;
|
||||||
|
|
||||||
|
private List<String> elements;
|
||||||
|
|
||||||
private InputStream inputStream;
|
private InputStream inputStream;
|
||||||
|
|
||||||
public XMLIterator(final String element, final InputStream inputStream) {
|
public XMLIterator(final String element, final InputStream inputStream) {
|
||||||
super();
|
super();
|
||||||
this.element = element;
|
this.element = element;
|
||||||
|
if (element.contains(",")) {
|
||||||
|
elements = Arrays
|
||||||
|
.stream(element.split(","))
|
||||||
|
.filter(StringUtils::isNoneBlank)
|
||||||
|
.map(String::toLowerCase)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
this.inputStream = inputStream;
|
this.inputStream = inputStream;
|
||||||
this.parser = getParser();
|
this.parser = getParser();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
this.current = findElement(parser);
|
this.current = findElement(parser);
|
||||||
} catch (XMLStreamException e) {
|
} catch (XMLStreamException e) {
|
||||||
|
@ -113,7 +127,7 @@ public class XMLIterator implements Iterator<String> {
|
||||||
final XMLEvent event = parser.nextEvent();
|
final XMLEvent event = parser.nextEvent();
|
||||||
|
|
||||||
// TODO: replace with depth tracking instead of close tag tracking.
|
// TODO: replace with depth tracking instead of close tag tracking.
|
||||||
if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
|
if (event.isEndElement() && isCheckTag(event.asEndElement().getName().getLocalPart())) {
|
||||||
writer.add(event);
|
writer.add(event);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -142,18 +156,16 @@ public class XMLIterator implements Iterator<String> {
|
||||||
XMLEvent peek = parser.peek();
|
XMLEvent peek = parser.peek();
|
||||||
if (peek != null && peek.isStartElement()) {
|
if (peek != null && peek.isStartElement()) {
|
||||||
String name = peek.asStartElement().getName().getLocalPart();
|
String name = peek.asStartElement().getName().getLocalPart();
|
||||||
if (element.equals(name)) {
|
if (isCheckTag(name))
|
||||||
return peek;
|
return peek;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while (parser.hasNext()) {
|
while (parser.hasNext()) {
|
||||||
final XMLEvent event = parser.nextEvent();
|
XMLEvent event = parser.nextEvent();
|
||||||
if (event != null && event.isStartElement()) {
|
if (event != null && event.isStartElement()) {
|
||||||
String name = event.asStartElement().getName().getLocalPart();
|
String name = event.asStartElement().getName().getLocalPart();
|
||||||
if (element.equals(name)) {
|
if (isCheckTag(name))
|
||||||
return event;
|
return event;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
|
@ -161,12 +173,31 @@ public class XMLIterator implements Iterator<String> {
|
||||||
|
|
||||||
private XMLEventReader getParser() {
|
private XMLEventReader getParser() {
|
||||||
try {
|
try {
|
||||||
return inputFactory.get().createXMLEventReader(sanitize(inputStream));
|
XMLInputFactory xif = inputFactory.get();
|
||||||
|
xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
|
||||||
|
return xif.createXMLEventReader(sanitize(inputStream));
|
||||||
} catch (XMLStreamException e) {
|
} catch (XMLStreamException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean isCheckTag(final String tagName) {
|
||||||
|
if (elements != null) {
|
||||||
|
final String found = elements
|
||||||
|
.stream()
|
||||||
|
.filter(e -> e.equalsIgnoreCase(tagName))
|
||||||
|
.findFirst()
|
||||||
|
.orElse(null);
|
||||||
|
if (found != null)
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
if (element.equalsIgnoreCase(tagName)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
private Reader sanitize(final InputStream in) {
|
private Reader sanitize(final InputStream in) {
|
||||||
final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
|
final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
|
||||||
charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
|
charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "sp",
|
||||||
|
"paramLongName": "sourcePath",
|
||||||
|
"paramDescription": "the zipped opencitations file",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "op",
|
||||||
|
"paramLongName": "outputPath",
|
||||||
|
"paramDescription": "the working path",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "issm",
|
||||||
|
"paramLongName": "isSparkSessionManaged",
|
||||||
|
"paramDescription": "the hdfs name node",
|
||||||
|
"paramRequired": false
|
||||||
|
},{
|
||||||
|
"paramName": "bl",
|
||||||
|
"paramLongName": "blackListPath",
|
||||||
|
"paramDescription": "the working path",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,3 @@
|
||||||
|
sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
|
||||||
|
outputPath=/tmp/miriam/webcrawlComplete/
|
||||||
|
blackListPath=/user/miriam.baglioni/openalex-blackList
|
|
@ -0,0 +1,58 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hive_metastore_uris</name>
|
||||||
|
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
|
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2ExtraListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorNumber</name>
|
||||||
|
<value>4</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2EventLogDir</name>
|
||||||
|
<value>/user/spark/spark2ApplicationHistory</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkDriverMemory</name>
|
||||||
|
<value>15G</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<value>6G</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<value>1</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,54 @@
|
||||||
|
<workflow-app name="WebCrawl Integration" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>mapreduce.job.queuename</name>
|
||||||
|
<value>${queueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||||
|
<value>${oozieLauncherQueueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="create_actionset"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="create_actionset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Produces the AS for WC</name>
|
||||||
|
<class>eu.dnetlib.dhp.actionmanager.webcrawl.CreateActionSetFromWebEntries</class>
|
||||||
|
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||||
|
<arg>--blackListPath</arg><arg>${blackListPath}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -271,12 +271,6 @@
|
||||||
"name": "An Roinn Sl\u00e1inte",
|
"name": "An Roinn Sl\u00e1inte",
|
||||||
"synonym": []
|
"synonym": []
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"id": "100018998",
|
|
||||||
"uri": "http://dx.doi.org/10.13039/100018998",
|
|
||||||
"name": "Irish Research eLibrary",
|
|
||||||
"synonym": []
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"id": "100019428",
|
"id": "100019428",
|
||||||
"uri": "http://dx.doi.org/10.13039/100019428",
|
"uri": "http://dx.doi.org/10.13039/100019428",
|
||||||
|
@ -631,12 +625,6 @@
|
||||||
"name": "Alimentary Health",
|
"name": "Alimentary Health",
|
||||||
"synonym": []
|
"synonym": []
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"id": "501100011103",
|
|
||||||
"uri": "http://dx.doi.org/10.13039/501100011103",
|
|
||||||
"name": "Rann\u00eds",
|
|
||||||
"synonym": []
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"id": "501100012354",
|
"id": "501100012354",
|
||||||
"uri": "http://dx.doi.org/10.13039/501100012354",
|
"uri": "http://dx.doi.org/10.13039/501100012354",
|
||||||
|
|
|
@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
|
||||||
tp._1 match {
|
tp._1 match {
|
||||||
case "electronic" => journal.setIssnOnline(tp._2)
|
case "electronic" => journal.setIssnOnline(tp._2)
|
||||||
case "print" => journal.setIssnPrinted(tp._2)
|
case "print" => journal.setIssnPrinted(tp._2)
|
||||||
|
case _ =>
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,17 @@ import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils._
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.{OafMapperUtils, PidType}
|
import eu.dnetlib.dhp.schema.oaf.utils.{OafMapperUtils, PidType}
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, Journal, Organization, Publication, Relation, Result, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{
|
||||||
|
Author,
|
||||||
|
DataInfo,
|
||||||
|
Instance,
|
||||||
|
Journal,
|
||||||
|
Organization,
|
||||||
|
Publication,
|
||||||
|
Relation,
|
||||||
|
Result,
|
||||||
|
Dataset => OafDataset
|
||||||
|
}
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
import org.apache.spark.sql.types._
|
import org.apache.spark.sql.types._
|
||||||
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
||||||
|
@ -69,23 +79,6 @@ object MagUtility extends Serializable {
|
||||||
private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)
|
private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)
|
||||||
|
|
||||||
private val MAGDataInfo: DataInfo = {
|
private val MAGDataInfo: DataInfo = {
|
||||||
val di = new DataInfo
|
|
||||||
di.setDeletedbyinference(false)
|
|
||||||
di.setInferred(false)
|
|
||||||
di.setInvisible(false)
|
|
||||||
di.setTrust("0.9")
|
|
||||||
di.setProvenanceaction(
|
|
||||||
OafMapperUtils.qualifier(
|
|
||||||
ModelConstants.SYSIMPORT_ACTIONSET,
|
|
||||||
ModelConstants.SYSIMPORT_ACTIONSET,
|
|
||||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
|
||||||
ModelConstants.DNET_PROVENANCE_ACTIONS
|
|
||||||
)
|
|
||||||
)
|
|
||||||
di
|
|
||||||
}
|
|
||||||
|
|
||||||
private val MAGDataInfoInvisible: DataInfo = {
|
|
||||||
val di = new DataInfo
|
val di = new DataInfo
|
||||||
di.setDeletedbyinference(false)
|
di.setDeletedbyinference(false)
|
||||||
di.setInferred(false)
|
di.setInferred(false)
|
||||||
|
@ -443,7 +436,6 @@ object MagUtility extends Serializable {
|
||||||
|
|
||||||
case "repository" =>
|
case "repository" =>
|
||||||
result = new Publication()
|
result = new Publication()
|
||||||
result.setDataInfo(MAGDataInfoInvisible)
|
|
||||||
qualifier(
|
qualifier(
|
||||||
"0038",
|
"0038",
|
||||||
"Other literature type",
|
"Other literature type",
|
||||||
|
@ -478,8 +470,7 @@ object MagUtility extends Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (result != null) {
|
if (result != null) {
|
||||||
if (result.getDataInfo == null)
|
result.setDataInfo(MAGDataInfo)
|
||||||
result.setDataInfo(MAGDataInfo)
|
|
||||||
val i = new Instance
|
val i = new Instance
|
||||||
i.setInstancetype(tp)
|
i.setInstancetype(tp)
|
||||||
i.setInstanceTypeMapping(
|
i.setInstanceTypeMapping(
|
||||||
|
@ -502,7 +493,7 @@ object MagUtility extends Serializable {
|
||||||
return null
|
return null
|
||||||
|
|
||||||
result.setCollectedfrom(List(MAGCollectedFrom).asJava)
|
result.setCollectedfrom(List(MAGCollectedFrom).asJava)
|
||||||
val pidList = List(
|
var pidList = List(
|
||||||
structuredProperty(
|
structuredProperty(
|
||||||
paper.paperId.get.toString,
|
paper.paperId.get.toString,
|
||||||
qualifier(
|
qualifier(
|
||||||
|
@ -515,8 +506,6 @@ object MagUtility extends Serializable {
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
result.setPid(pidList.asJava)
|
|
||||||
|
|
||||||
result.setOriginalId(pidList.map(s => s.getValue).asJava)
|
result.setOriginalId(pidList.map(s => s.getValue).asJava)
|
||||||
|
|
||||||
result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
|
result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
|
||||||
|
@ -608,22 +597,23 @@ object MagUtility extends Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
val instance = result.getInstance().get(0)
|
val instance = result.getInstance().get(0)
|
||||||
instance.setPid(pidList.asJava)
|
|
||||||
if (paper.doi.orNull != null)
|
if (paper.doi.orNull != null) {
|
||||||
instance.setAlternateIdentifier(
|
pidList = pidList ::: List(
|
||||||
List(
|
structuredProperty(
|
||||||
structuredProperty(
|
paper.doi.get,
|
||||||
paper.doi.get,
|
qualifier(
|
||||||
qualifier(
|
PidType.doi.toString,
|
||||||
PidType.doi.toString,
|
PidType.doi.toString,
|
||||||
PidType.doi.toString,
|
ModelConstants.DNET_PID_TYPES,
|
||||||
ModelConstants.DNET_PID_TYPES,
|
ModelConstants.DNET_PID_TYPES
|
||||||
ModelConstants.DNET_PID_TYPES
|
),
|
||||||
),
|
null
|
||||||
null
|
)
|
||||||
)
|
|
||||||
).asJava
|
|
||||||
)
|
)
|
||||||
|
}
|
||||||
|
instance.setPid(pidList.asJava)
|
||||||
|
result.setPid(pidList.asJava)
|
||||||
instance.setUrl(paper.urls.get.asJava)
|
instance.setUrl(paper.urls.get.asJava)
|
||||||
instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
|
instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
|
||||||
instance.setCollectedfrom(MAGCollectedFrom)
|
instance.setCollectedfrom(MAGCollectedFrom)
|
||||||
|
@ -688,33 +678,45 @@ object MagUtility extends Serializable {
|
||||||
o.setLegalname(field(r.getAs[String]("DisplayName"), null))
|
o.setLegalname(field(r.getAs[String]("DisplayName"), null))
|
||||||
val gid = r.getAs[String]("GridId")
|
val gid = r.getAs[String]("GridId")
|
||||||
if (gid != null) {
|
if (gid != null) {
|
||||||
o.setPid(List(
|
o.setPid(
|
||||||
structuredProperty(gid, qualifier(
|
List(
|
||||||
PidType.GRID.toString,
|
structuredProperty(
|
||||||
PidType.GRID.toString,
|
gid,
|
||||||
ModelConstants.DNET_PID_TYPES,
|
qualifier(
|
||||||
ModelConstants.DNET_PID_TYPES
|
PidType.GRID.toString,
|
||||||
),
|
PidType.GRID.toString,
|
||||||
null),
|
ModelConstants.DNET_PID_TYPES,
|
||||||
structuredProperty(r.getAs[Long]("AffiliationId").toString, qualifier(
|
ModelConstants.DNET_PID_TYPES
|
||||||
PidType.mag_id.toString,
|
),
|
||||||
PidType.mag_id.toString,
|
null
|
||||||
ModelConstants.DNET_PID_TYPES,
|
),
|
||||||
ModelConstants.DNET_PID_TYPES
|
structuredProperty(
|
||||||
),
|
r.getAs[Long]("AffiliationId").toString,
|
||||||
null)
|
qualifier(
|
||||||
|
PidType.mag_id.toString,
|
||||||
).asJava)
|
PidType.mag_id.toString,
|
||||||
|
ModelConstants.DNET_PID_TYPES,
|
||||||
|
ModelConstants.DNET_PID_TYPES
|
||||||
|
),
|
||||||
|
null
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
} else {
|
} else {
|
||||||
o.setPid(List(
|
o.setPid(
|
||||||
structuredProperty(r.getAs[Long]("AffiliationId").toString, qualifier(
|
List(
|
||||||
PidType.mag_id.toString,
|
structuredProperty(
|
||||||
PidType.mag_id.toString,
|
r.getAs[Long]("AffiliationId").toString,
|
||||||
ModelConstants.DNET_PID_TYPES,
|
qualifier(
|
||||||
ModelConstants.DNET_PID_TYPES
|
PidType.mag_id.toString,
|
||||||
),
|
PidType.mag_id.toString,
|
||||||
null)
|
ModelConstants.DNET_PID_TYPES,
|
||||||
).asJava)
|
ModelConstants.DNET_PID_TYPES
|
||||||
|
),
|
||||||
|
null
|
||||||
|
)
|
||||||
|
).asJava
|
||||||
|
)
|
||||||
}
|
}
|
||||||
val c = r.getAs[String]("Iso3166Code")
|
val c = r.getAs[String]("Iso3166Code")
|
||||||
if (c != null)
|
if (c != null)
|
||||||
|
|
|
@ -38,6 +38,7 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
|
||||||
spark.read
|
spark.read
|
||||||
.load(s"$magBasePath/mag_denormalized")
|
.load(s"$magBasePath/mag_denormalized")
|
||||||
.as[MAGPaper]
|
.as[MAGPaper]
|
||||||
|
.filter(col("doi").isNotNull)
|
||||||
.map(s => MagUtility.convertMAGtoOAF(s))
|
.map(s => MagUtility.convertMAGtoOAF(s))
|
||||||
.filter(s => s != null)
|
.filter(s => s != null)
|
||||||
.write
|
.write
|
||||||
|
|
|
@ -6,33 +6,37 @@ import eu.dnetlib.dhp.schema.oaf.Organization
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
class SparkMagOrganizationAS (propertyPath: String, args: Array[String], log: Logger)
|
class SparkMagOrganizationAS(propertyPath: String, args: Array[String], log: Logger)
|
||||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||||
|
|
||||||
/** Here all the spark applications runs this method
|
/** Here all the spark applications runs this method
|
||||||
* where the whole logic of the spark node is defined
|
* where the whole logic of the spark node is defined
|
||||||
*/
|
*/
|
||||||
override def run(): Unit = {
|
override def run(): Unit = {
|
||||||
val magBasePath:String = parser.get("magBasePath")
|
val magBasePath: String = parser.get("magBasePath")
|
||||||
log.info(s"magBasePath is $magBasePath")
|
log.info(s"magBasePath is $magBasePath")
|
||||||
val outputPath:String = parser.get("outputPath")
|
val outputPath: String = parser.get("outputPath")
|
||||||
log.info(s"outputPath is $outputPath")
|
log.info(s"outputPath is $outputPath")
|
||||||
generateAS(spark,magBasePath, outputPath)
|
generateAS(spark, magBasePath, outputPath)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def generateAS(spark:SparkSession, magBasePath:String,outputPath:String ):Unit = {
|
def generateAS(spark: SparkSession, magBasePath: String, outputPath: String): Unit = {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
val organizations = MagUtility.loadMagEntity(spark,"Affiliations", magBasePath)
|
val organizations = MagUtility.loadMagEntity(spark, "Affiliations", magBasePath)
|
||||||
organizations.map(r => MagUtility.generateOrganization(r)).write.mode(SaveMode.Overwrite)
|
organizations
|
||||||
|
.map(r => MagUtility.generateOrganization(r))
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.text(outputPath)
|
.text(outputPath)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object SparkMagOrganizationAS{
|
object SparkMagOrganizationAS {
|
||||||
|
|
||||||
val log: Logger = LoggerFactory.getLogger(SparkMagOrganizationAS.getClass)
|
val log: Logger = LoggerFactory.getLogger(SparkMagOrganizationAS.getClass)
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
new SparkMagOrganizationAS("/eu/dnetlib/dhp/collection/mag/create_organization_AS.json", args, log)
|
new SparkMagOrganizationAS("/eu/dnetlib/dhp/collection/mag/create_organization_AS.json", args, log)
|
||||||
.initialize()
|
.initialize()
|
||||||
|
|
|
@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
|
||||||
import eu.dnetlib.dhp.sx.bio.pubmed._
|
import eu.dnetlib.dhp.sx.bio.pubmed._
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
|
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.conf.Configuration
|
import org.apache.hadoop.conf.Configuration
|
||||||
|
@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
|
||||||
import org.apache.http.impl.client.HttpClientBuilder
|
import org.apache.http.impl.client.HttpClientBuilder
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.sql.expressions.Aggregator
|
|
||||||
import org.apache.spark.sql._
|
import org.apache.spark.sql._
|
||||||
|
import org.apache.spark.sql.expressions.Aggregator
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import java.io.InputStream
|
import java.io.{ByteArrayInputStream, InputStream}
|
||||||
import scala.io.Source
|
import java.nio.charset.Charset
|
||||||
import scala.xml.pull.XMLEventReader
|
import javax.xml.stream.XMLInputFactory
|
||||||
|
|
||||||
object SparkCreateBaselineDataFrame {
|
object SparkCreateBaselineDataFrame {
|
||||||
|
|
||||||
|
@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
|
||||||
if (response.getStatusLine.getStatusCode > 400) {
|
if (response.getStatusLine.getStatusCode > 400) {
|
||||||
tries -= 1
|
tries -= 1
|
||||||
} else
|
} else
|
||||||
return IOUtils.toString(response.getEntity.getContent)
|
return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
|
||||||
} catch {
|
} catch {
|
||||||
case e: Throwable =>
|
case e: Throwable =>
|
||||||
println(s"Error on requesting ${r.getURI}")
|
println(s"Error on requesting ${r.getURI}")
|
||||||
|
@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
|
||||||
IOUtils.toString(
|
IOUtils.toString(
|
||||||
SparkEBILinksToOaf.getClass.getResourceAsStream(
|
SparkEBILinksToOaf.getClass.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
|
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
|
||||||
)
|
),
|
||||||
|
Charset.defaultCharset()
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
parser.parseArgument(args)
|
parser.parseArgument(args)
|
||||||
|
@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
|
||||||
val workingPath = parser.get("workingPath")
|
val workingPath = parser.get("workingPath")
|
||||||
log.info("workingPath: {}", workingPath)
|
log.info("workingPath: {}", workingPath)
|
||||||
|
|
||||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
val targetPath = parser.get("targetPath")
|
||||||
log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
|
log.info("targetPath: {}", targetPath)
|
||||||
|
|
||||||
val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
|
||||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
|
||||||
log.info("outputBasePath: {}", outputBasePath)
|
|
||||||
|
|
||||||
val hdfsServerUri = parser.get("hdfsServerUri")
|
val hdfsServerUri = parser.get("hdfsServerUri")
|
||||||
log.info("hdfsServerUri: {}", hdfsServerUri)
|
log.info("hdfsServerUri: {}", targetPath)
|
||||||
|
|
||||||
val skipUpdate = parser.get("skipUpdate")
|
val skipUpdate = parser.get("skipUpdate")
|
||||||
log.info("skipUpdate: {}", skipUpdate)
|
log.info("skipUpdate: {}", skipUpdate)
|
||||||
|
@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
|
||||||
if (!"true".equalsIgnoreCase(skipUpdate)) {
|
if (!"true".equalsIgnoreCase(skipUpdate)) {
|
||||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
|
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
|
||||||
|
val inputFactory = XMLInputFactory.newInstance
|
||||||
val ds: Dataset[PMArticle] = spark.createDataset(
|
val ds: Dataset[PMArticle] = spark.createDataset(
|
||||||
k.filter(i => i._1.endsWith(".gz"))
|
k.filter(i => i._1.endsWith(".gz"))
|
||||||
.flatMap(i => {
|
.flatMap(i => {
|
||||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
|
||||||
new PMParser(xml)
|
new PMParser(xml)
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
|
@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
|
||||||
.map(a => PubMedToOaf.convert(a, vocabularies))
|
.map(a => PubMedToOaf.convert(a, vocabularies))
|
||||||
.as[Oaf]
|
.as[Oaf]
|
||||||
.filter(p => p != null),
|
.filter(p => p != null),
|
||||||
s"$outputBasePath/$MDSTORE_DATA_PATH"
|
targetPath
|
||||||
)
|
)
|
||||||
|
|
||||||
val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
|
|
||||||
val mdStoreSize = df.count
|
|
||||||
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
package eu.dnetlib.dhp.sx.bio.pubmed
|
package eu.dnetlib.dhp.sx.bio.pubmed
|
||||||
|
|
||||||
import scala.xml.MetaData
|
import scala.xml.MetaData
|
||||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
import javax.xml.stream.XMLEventReader
|
||||||
|
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
|
||||||
|
|
||||||
/** @param xml
|
/** @param xml
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.junit.jupiter.api.AfterAll;
|
import org.junit.jupiter.api.*;
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
|
|
@ -119,7 +119,9 @@ public class ReadCOCITest {
|
||||||
workingDir.toString() + "/COCI",
|
workingDir.toString() + "/COCI",
|
||||||
"-outputPath",
|
"-outputPath",
|
||||||
workingDir.toString() + "/COCI_json/",
|
workingDir.toString() + "/COCI_json/",
|
||||||
"-inputFile", "input1;input2;input3;input4;input5"
|
"-inputFile", "input1;input2;input3;input4;input5",
|
||||||
|
"-format",
|
||||||
|
"COCI"
|
||||||
});
|
});
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
|
@ -0,0 +1,297 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.actionmanager.webcrawl;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.junit.jupiter.api.AfterAll;
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author miriam.baglioni
|
||||||
|
* @Date 22/04/24
|
||||||
|
*/
|
||||||
|
public class CreateASTest {
|
||||||
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
private static SparkSession spark;
|
||||||
|
|
||||||
|
private static Path workingDir;
|
||||||
|
private static final Logger log = LoggerFactory
|
||||||
|
.getLogger(CreateASTest.class);
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void beforeAll() throws IOException {
|
||||||
|
workingDir = Files
|
||||||
|
.createTempDirectory(CreateASTest.class.getSimpleName());
|
||||||
|
log.info("using work dir {}", workingDir);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.setAppName(CreateASTest.class.getSimpleName());
|
||||||
|
|
||||||
|
conf.setMaster("local[*]");
|
||||||
|
conf.set("spark.driver.host", "localhost");
|
||||||
|
conf.set("hive.metastore.local", "true");
|
||||||
|
conf.set("spark.ui.enabled", "false");
|
||||||
|
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||||
|
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||||
|
|
||||||
|
spark = SparkSession
|
||||||
|
.builder()
|
||||||
|
.appName(CreateASTest.class.getSimpleName())
|
||||||
|
.config(conf)
|
||||||
|
.getOrCreate();
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterAll
|
||||||
|
public static void afterAll() throws IOException {
|
||||||
|
FileUtils.deleteDirectory(workingDir.toFile());
|
||||||
|
spark.stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testNumberofRelations() throws Exception {
|
||||||
|
|
||||||
|
String inputPath = getClass()
|
||||||
|
.getResource(
|
||||||
|
"/eu/dnetlib/dhp/actionmanager/webcrawl/input/")
|
||||||
|
.getPath();
|
||||||
|
String blackListPath = getClass()
|
||||||
|
.getResource(
|
||||||
|
"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
CreateActionSetFromWebEntries
|
||||||
|
.main(
|
||||||
|
new String[] {
|
||||||
|
"-isSparkSessionManaged",
|
||||||
|
Boolean.FALSE.toString(),
|
||||||
|
"-sourcePath",
|
||||||
|
inputPath,
|
||||||
|
"-outputPath",
|
||||||
|
workingDir.toString() + "/actionSet1",
|
||||||
|
"-blackListPath", blackListPath
|
||||||
|
});
|
||||||
|
|
||||||
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaRDD<Relation> tmp = sc
|
||||||
|
.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
|
||||||
|
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||||
|
.map(aa -> ((Relation) aa.getPayload()));
|
||||||
|
|
||||||
|
Assertions.assertEquals(58, tmp.count());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testRelations() throws Exception {
|
||||||
|
|
||||||
|
// , "doi":"https://doi.org/10.1126/science.1188021", "pmid":"https://pubmed.ncbi.nlm.nih.gov/20448178", https://www.ncbi.nlm.nih.gov/pmc/articles/5100745
|
||||||
|
|
||||||
|
String inputPath = getClass()
|
||||||
|
.getResource(
|
||||||
|
"/eu/dnetlib/dhp/actionmanager/webcrawl/")
|
||||||
|
.getPath();
|
||||||
|
String blackListPath = getClass()
|
||||||
|
.getResource(
|
||||||
|
"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
CreateActionSetFromWebEntries
|
||||||
|
.main(
|
||||||
|
new String[] {
|
||||||
|
"-isSparkSessionManaged",
|
||||||
|
Boolean.FALSE.toString(),
|
||||||
|
"-sourcePath",
|
||||||
|
inputPath,
|
||||||
|
"-outputPath",
|
||||||
|
workingDir.toString() + "/actionSet1",
|
||||||
|
"-blackListPath", blackListPath
|
||||||
|
});
|
||||||
|
|
||||||
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaRDD<Relation> tmp = sc
|
||||||
|
.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
|
||||||
|
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||||
|
.map(aa -> ((Relation) aa.getPayload()));
|
||||||
|
|
||||||
|
tmp.foreach(r -> System.out.println(new ObjectMapper().writeValueAsString(r)));
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, tmp
|
||||||
|
.filter(
|
||||||
|
r -> r
|
||||||
|
.getSource()
|
||||||
|
.equals(
|
||||||
|
"50|doi_________::" + IdentifierFactory
|
||||||
|
.md5(
|
||||||
|
PidCleaner
|
||||||
|
.normalizePidValue(PidType.doi.toString(), "10.1098/rstl.1684.0023"))))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, tmp
|
||||||
|
.filter(
|
||||||
|
r -> r
|
||||||
|
.getTarget()
|
||||||
|
.equals(
|
||||||
|
"50|doi_________::" + IdentifierFactory
|
||||||
|
.md5(
|
||||||
|
PidCleaner
|
||||||
|
.normalizePidValue(PidType.doi.toString(), "10.1098/rstl.1684.0023"))))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, tmp
|
||||||
|
.filter(
|
||||||
|
r -> r
|
||||||
|
.getSource()
|
||||||
|
.equals(
|
||||||
|
"20|ror_________::" + IdentifierFactory
|
||||||
|
.md5(
|
||||||
|
PidCleaner
|
||||||
|
.normalizePidValue("ROR", "https://ror.org/03argrj65"))))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, tmp
|
||||||
|
.filter(
|
||||||
|
r -> r
|
||||||
|
.getTarget()
|
||||||
|
.equals(
|
||||||
|
"20|ror_________::" + IdentifierFactory
|
||||||
|
.md5(
|
||||||
|
PidCleaner
|
||||||
|
.normalizePidValue("ROR", "https://ror.org/03argrj65"))))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
2, tmp
|
||||||
|
.filter(
|
||||||
|
r -> r
|
||||||
|
.getSource()
|
||||||
|
.equals(
|
||||||
|
"20|ror_________::" + IdentifierFactory
|
||||||
|
.md5(
|
||||||
|
PidCleaner
|
||||||
|
.normalizePidValue("ROR", "https://ror.org/03265fv13"))))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
2, tmp
|
||||||
|
.filter(
|
||||||
|
r -> r
|
||||||
|
.getTarget()
|
||||||
|
.equals(
|
||||||
|
"20|ror_________::" + IdentifierFactory
|
||||||
|
.md5(
|
||||||
|
PidCleaner
|
||||||
|
.normalizePidValue("ROR", "https://ror.org/03265fv13"))))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, tmp
|
||||||
|
.filter(
|
||||||
|
r -> r
|
||||||
|
.getTarget()
|
||||||
|
.equals(
|
||||||
|
"20|ror_________::" + IdentifierFactory
|
||||||
|
.md5(
|
||||||
|
PidCleaner
|
||||||
|
.normalizePidValue(PidType.doi.toString(), "https://ror.org/03265fv13")))
|
||||||
|
&& r.getSource().startsWith("50|doi"))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
1, tmp
|
||||||
|
.filter(
|
||||||
|
r -> r
|
||||||
|
.getTarget()
|
||||||
|
.equals(
|
||||||
|
"20|ror_________::" + IdentifierFactory
|
||||||
|
.md5(
|
||||||
|
PidCleaner
|
||||||
|
.normalizePidValue(PidType.doi.toString(), "https://ror.org/03265fv13")))
|
||||||
|
&& r.getSource().startsWith("50|pmid"))
|
||||||
|
.count());
|
||||||
|
|
||||||
|
Assertions
|
||||||
|
.assertEquals(
|
||||||
|
0, tmp
|
||||||
|
.filter(
|
||||||
|
r -> r
|
||||||
|
.getTarget()
|
||||||
|
.equals(
|
||||||
|
"20|ror_________::" + IdentifierFactory
|
||||||
|
.md5(
|
||||||
|
PidCleaner
|
||||||
|
.normalizePidValue(PidType.doi.toString(), "https://ror.org/03265fv13")))
|
||||||
|
&& r.getSource().startsWith("50|pmc"))
|
||||||
|
.count());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testRelationsCollectedFrom() throws Exception {
|
||||||
|
|
||||||
|
String inputPath = getClass()
|
||||||
|
.getResource(
|
||||||
|
"/eu/dnetlib/dhp/actionmanager/webcrawl")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
CreateActionSetFromWebEntries
|
||||||
|
.main(
|
||||||
|
new String[] {
|
||||||
|
"-isSparkSessionManaged",
|
||||||
|
Boolean.FALSE.toString(),
|
||||||
|
"-sourcePath",
|
||||||
|
inputPath,
|
||||||
|
"-outputPath",
|
||||||
|
workingDir.toString() + "/actionSet1"
|
||||||
|
});
|
||||||
|
|
||||||
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaRDD<Relation> tmp = sc
|
||||||
|
.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
|
||||||
|
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||||
|
.map(aa -> ((Relation) aa.getPayload()));
|
||||||
|
|
||||||
|
tmp.foreach(r -> {
|
||||||
|
assertEquals("Web Crawl", r.getCollectedfrom().get(0).getValue());
|
||||||
|
assertEquals("10|openaire____::fb98a192f6a055ba495ef414c330834b", r.getCollectedfrom().get(0).getKey());
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,64 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.collection.plugin.file;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.junit.jupiter.api.*;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||||
|
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||||
|
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||||
|
|
||||||
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
public class FileGZipMultipleNodeTest {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);
|
||||||
|
|
||||||
|
private final ApiDescriptor api = new ApiDescriptor();
|
||||||
|
|
||||||
|
private FileGZipCollectorPlugin plugin;
|
||||||
|
|
||||||
|
private static final String SPLIT_ON_ELEMENT = "incollection,article";
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws IOException {
|
||||||
|
|
||||||
|
final String gzipFile = Objects
|
||||||
|
.requireNonNull(
|
||||||
|
this
|
||||||
|
.getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz"))
|
||||||
|
.getFile();
|
||||||
|
|
||||||
|
api.setBaseUrl(gzipFile);
|
||||||
|
|
||||||
|
HashMap<String, String> params = new HashMap<>();
|
||||||
|
params.put("splitOnElement", SPLIT_ON_ELEMENT);
|
||||||
|
|
||||||
|
api.setParams(params);
|
||||||
|
|
||||||
|
FileSystem fs = FileSystem.get(new Configuration());
|
||||||
|
plugin = new FileGZipCollectorPlugin(fs);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void test() throws CollectorException {
|
||||||
|
|
||||||
|
final Stream<String> stream = plugin.collect(api, new AggregatorReport());
|
||||||
|
|
||||||
|
stream.limit(10).forEach(s -> {
|
||||||
|
Assertions.assertTrue(s.length() > 0);
|
||||||
|
log.info(s);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,7 +1,9 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.rest;
|
package eu.dnetlib.dhp.collection.plugin.rest;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
@ -37,8 +39,8 @@ public class OsfPreprintCollectorTest {
|
||||||
private final String resumptionType = "page";
|
private final String resumptionType = "page";
|
||||||
private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
|
private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
|
||||||
|
|
||||||
private final String resultSizeParam = "";
|
private final String resultSizeParam = "page[size]";
|
||||||
private final String resultSizeValue = "";
|
private final String resultSizeValue = "100";
|
||||||
|
|
||||||
private final String resultFormatParam = "format";
|
private final String resultFormatParam = "format";
|
||||||
private final String resultFormatValue = "json";
|
private final String resultFormatValue = "json";
|
||||||
|
@ -68,11 +70,11 @@ public class OsfPreprintCollectorTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Disabled
|
@Disabled
|
||||||
void test() throws CollectorException {
|
void test_limited() throws CollectorException {
|
||||||
final AtomicInteger i = new AtomicInteger(0);
|
final AtomicInteger i = new AtomicInteger(0);
|
||||||
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
|
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
|
||||||
|
|
||||||
stream.limit(200).forEach(s -> {
|
stream.limit(2000).forEach(s -> {
|
||||||
Assertions.assertTrue(s.length() > 0);
|
Assertions.assertTrue(s.length() > 0);
|
||||||
i.incrementAndGet();
|
i.incrementAndGet();
|
||||||
log.info(s);
|
log.info(s);
|
||||||
|
@ -81,4 +83,23 @@ public class OsfPreprintCollectorTest {
|
||||||
log.info("{}", i.intValue());
|
log.info("{}", i.intValue());
|
||||||
Assertions.assertTrue(i.intValue() > 0);
|
Assertions.assertTrue(i.intValue() > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Disabled
|
||||||
|
void test_all() throws CollectorException {
|
||||||
|
final AtomicLong i = new AtomicLong(0);
|
||||||
|
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
|
||||||
|
|
||||||
|
stream.forEach(s -> {
|
||||||
|
Assertions.assertTrue(s.length() > 0);
|
||||||
|
if ((i.incrementAndGet() % 1000) == 0) {
|
||||||
|
log.info("COLLECTED: {}", i.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
log.info("TOTAL: {}", i.get());
|
||||||
|
Assertions.assertTrue(i.get() > 0);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,6 +4,11 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.rest;
|
package eu.dnetlib.dhp.collection.plugin.rest;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||||
|
@ -25,18 +32,18 @@ class RestCollectorPluginTest {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
|
private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
|
||||||
|
|
||||||
private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
|
private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
|
||||||
private final String resumptionType = "count";
|
private final String resumptionType = "discover";
|
||||||
private final String resumptionParam = "from";
|
private final String resumptionParam = "skip";
|
||||||
private final String entityXpath = "//hits/hits";
|
private final String entityXpath = "//*[local-name()='data']";
|
||||||
private final String resumptionXpath = "//hits";
|
private final String resumptionXpath = "";
|
||||||
private final String resultTotalXpath = "//hits/total";
|
private final String resultTotalXpath = "//*[local-name()='count']";
|
||||||
private final String resultFormatParam = "format";
|
private final String resultFormatParam = "";
|
||||||
private final String resultFormatValue = "json";
|
private final String resultFormatValue = "json";
|
||||||
private final String resultSizeParam = "size";
|
private final String resultSizeParam = "top";
|
||||||
private final String resultSizeValue = "10";
|
private final String resultSizeValue = "10";
|
||||||
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
|
// private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
|
||||||
private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
|
private final String query = "";
|
||||||
// private String query = "=(sources:engrXiv AND type:preprint)";
|
// private String query = "=(sources:engrXiv AND type:preprint)";
|
||||||
|
|
||||||
private final String protocolDescriptor = "rest_json2xml";
|
private final String protocolDescriptor = "rest_json2xml";
|
||||||
|
@ -56,6 +63,7 @@ class RestCollectorPluginTest {
|
||||||
params.put("resultSizeValue", resultSizeValue);
|
params.put("resultSizeValue", resultSizeValue);
|
||||||
params.put("queryParams", query);
|
params.put("queryParams", query);
|
||||||
params.put("entityXpath", entityXpath);
|
params.put("entityXpath", entityXpath);
|
||||||
|
params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");
|
||||||
|
|
||||||
api.setBaseUrl(baseUrl);
|
api.setBaseUrl(baseUrl);
|
||||||
api.setParams(params);
|
api.setParams(params);
|
||||||
|
@ -78,4 +86,19 @@ class RestCollectorPluginTest {
|
||||||
log.info("{}", i.intValue());
|
log.info("{}", i.intValue());
|
||||||
Assertions.assertTrue(i.intValue() > 0);
|
Assertions.assertTrue(i.intValue() > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Disabled
|
||||||
|
@Test
|
||||||
|
void testUrl() throws IOException {
|
||||||
|
String url_s = "https://ddh-openapi.worldbank.org/search?&top=10";
|
||||||
|
URL url = new URL(url_s);
|
||||||
|
final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
|
||||||
|
conn.setRequestMethod("GET");
|
||||||
|
conn.setRequestProperty("User-Agent", "OpenAIRE");
|
||||||
|
Gson gson = new Gson();
|
||||||
|
System.out.println("Request header");
|
||||||
|
System.out.println(gson.toJson(conn.getHeaderFields()));
|
||||||
|
InputStream inputStream = conn.getInputStream();
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,7 +44,7 @@ public class RestIteratorTest {
|
||||||
|
|
||||||
final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
|
final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
|
||||||
resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
|
resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
|
||||||
query, entityXpath, authMethod, authToken, resultOffsetParam);
|
query, entityXpath, authMethod, authToken, resultOffsetParam, null);
|
||||||
int i = 20;
|
int i = 20;
|
||||||
while (iterator.hasNext() && i > 0) {
|
while (iterator.hasNext() && i > 0) {
|
||||||
String result = iterator.next();
|
String result = iterator.next();
|
||||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -789,10 +789,6 @@
|
||||||
"value": "2227-9717",
|
"value": "2227-9717",
|
||||||
"type": "electronic"
|
"type": "electronic"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"value": "VALUE",
|
|
||||||
"type": "PIPPO"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"value": "1063-4584",
|
"value": "1063-4584",
|
||||||
"type": "pu"
|
"type": "pu"
|
||||||
|
|
Binary file not shown.
|
@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||||
import org.junit.jupiter.api.BeforeEach
|
import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
|
||||||
|
import org.apache.commons.io.IOUtils
|
||||||
|
import org.junit.jupiter.api.{BeforeEach, Test}
|
||||||
import org.junit.jupiter.api.extension.ExtendWith
|
import org.junit.jupiter.api.extension.ExtendWith
|
||||||
import org.mockito.junit.jupiter.MockitoExtension
|
import org.mockito.junit.jupiter.MockitoExtension
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
|
||||||
super.setUpVocabulary()
|
super.setUpVocabulary()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def mappingRecord(): Unit = {
|
||||||
|
val input =
|
||||||
|
IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")
|
||||||
|
|
||||||
|
println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All))
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.mag
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
|
import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
|
||||||
import org.apache.spark.sql.SparkSession
|
import org.apache.spark.sql.SparkSession
|
||||||
|
import org.apache.spark.sql.functions.col
|
||||||
import org.junit.jupiter.api.Assertions._
|
import org.junit.jupiter.api.Assertions._
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
|
||||||
|
@ -10,7 +11,6 @@ class MAGMappingTest {
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
|
||||||
def mappingTest(): Unit = {
|
def mappingTest(): Unit = {
|
||||||
|
|
||||||
val spark = SparkSession
|
val spark = SparkSession
|
||||||
|
@ -19,10 +19,8 @@ class MAGMappingTest {
|
||||||
.master("local[*]")
|
.master("local[*]")
|
||||||
.getOrCreate()
|
.getOrCreate()
|
||||||
|
|
||||||
val s = new SparkMagOrganizationAS(null, null, null)
|
val s = new SparkMAGtoOAF(null, null, null)
|
||||||
|
s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
|
||||||
s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
|
||||||
|
|
||||||
import java.io.{BufferedReader, InputStream, InputStreamReader}
|
import java.io.{BufferedReader, InputStream, InputStreamReader}
|
||||||
import java.util.zip.GZIPInputStream
|
import java.util.zip.GZIPInputStream
|
||||||
|
import javax.xml.stream.XMLInputFactory
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.collection.mutable.ListBuffer
|
import scala.collection.mutable.ListBuffer
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testEBIData() = {
|
def testEBIData() = {
|
||||||
val inputXML = Source
|
val inputFactory = XMLInputFactory.newInstance
|
||||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
|
val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
|
||||||
.mkString
|
|
||||||
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
|
|
||||||
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
|
new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testParsingPubmedXML(): Unit = {
|
def testParsingPubmedXML(): Unit = {
|
||||||
val xml = new XMLEventReader(
|
val inputFactory = XMLInputFactory.newInstance
|
||||||
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
|
|
||||||
)
|
val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
|
||||||
|
|
||||||
val parser = new PMParser(xml)
|
val parser = new PMParser(xml)
|
||||||
parser.foreach(checkPMArticle)
|
parser.foreach(checkPMArticle)
|
||||||
}
|
}
|
||||||
|
@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
|
||||||
@Test
|
@Test
|
||||||
def testPubmedMapping(): Unit = {
|
def testPubmedMapping(): Unit = {
|
||||||
|
|
||||||
val xml = new XMLEventReader(
|
val inputFactory = XMLInputFactory.newInstance
|
||||||
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
|
val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
|
||||||
)
|
|
||||||
val parser = new PMParser(xml)
|
val parser = new PMParser(xml)
|
||||||
val results = ListBuffer[Oaf]()
|
val results = ListBuffer[Oaf]()
|
||||||
parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))
|
parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))
|
||||||
|
|
|
@ -38,7 +38,6 @@
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
</plugins>
|
</plugins>
|
||||||
|
|
||||||
</build>
|
</build>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
@ -54,24 +53,10 @@
|
||||||
<artifactId>dhp-pace-core</artifactId>
|
<artifactId>dhp-pace-core</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.scala-lang.modules</groupId>
|
|
||||||
<artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
|
|
||||||
<version>1.0.2</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.scala-lang.modules</groupId>
|
|
||||||
<artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
|
|
||||||
<version>2.11.0</version>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
|
@ -80,16 +65,10 @@
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
|
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.arakelian</groupId>
|
|
||||||
<artifactId>java-jq</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>dom4j</groupId>
|
<groupId>dom4j</groupId>
|
||||||
<artifactId>dom4j</artifactId>
|
<artifactId>dom4j</artifactId>
|
||||||
|
@ -102,10 +81,6 @@
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
<artifactId>jackson-databind</artifactId>
|
<artifactId>jackson-databind</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
|
||||||
<artifactId>jackson-core</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.httpcomponents</groupId>
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
<artifactId>httpclient</artifactId>
|
<artifactId>httpclient</artifactId>
|
||||||
|
|
|
@ -189,7 +189,7 @@ public class DedupRecordFactory {
|
||||||
entity = swap;
|
entity = swap;
|
||||||
}
|
}
|
||||||
|
|
||||||
entity = MergeUtils.checkedMerge(entity, duplicate);
|
entity = MergeUtils.checkedMerge(entity, duplicate, false);
|
||||||
|
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||||
Result re = (Result) entity;
|
Result re = (Result) entity;
|
||||||
|
|
|
@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import eu.dnetlib.pace.util.SparkCompatUtils;
|
||||||
import scala.Tuple3;
|
import scala.Tuple3;
|
||||||
import scala.collection.JavaConversions;
|
import scala.collection.JavaConversions;
|
||||||
|
|
||||||
|
@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
Dataset<Row> pivotHistory = spark
|
Dataset<Row> pivotHistory = spark
|
||||||
.createDataset(
|
.createDataset(
|
||||||
Collections.emptyList(),
|
Collections.emptyList(),
|
||||||
RowEncoder
|
SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));
|
||||||
.apply(StructType.fromDDL("id STRING, lastUsage STRING")));
|
|
||||||
|
|
||||||
if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
|
if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
|
||||||
pivotHistory = spark
|
pivotHistory = spark
|
||||||
|
@ -175,6 +175,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
}
|
}
|
||||||
|
|
||||||
// cap pidType at w3id as from there on they are considered equal
|
// cap pidType at w3id as from there on they are considered equal
|
||||||
|
|
||||||
UserDefinedFunction mapPid = udf(
|
UserDefinedFunction mapPid = udf(
|
||||||
(String s) -> Math.min(PidType.tryValueOf(s).ordinal(), PidType.w3id.ordinal()), DataTypes.IntegerType);
|
(String s) -> Math.min(PidType.tryValueOf(s).ordinal(), PidType.w3id.ordinal()), DataTypes.IntegerType);
|
||||||
|
|
||||||
|
|
|
@ -44,8 +44,10 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
new SparkCreateSimRels(parser, getSparkSession(conf))
|
try (SparkSession session = getSparkSession(conf)) {
|
||||||
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
new SparkCreateSimRels(parser, session)
|
||||||
|
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -22,7 +22,9 @@ import eu.dnetlib.dhp.oa.dedup.model.OrgSimRel;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
@ -164,12 +166,12 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<Tuple2<Tuple2<String, Organization>, Tuple2<String, String>>, OrgSimRel>) r -> new OrgSimRel(
|
(MapFunction<Tuple2<Tuple2<String, Organization>, Tuple2<String, String>>, OrgSimRel>) r -> new OrgSimRel(
|
||||||
"",
|
"",
|
||||||
r._1()._2().getOriginalId().get(0),
|
Optional.ofNullable(r._1()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null),
|
||||||
r._1()._2().getLegalname() != null ? r._1()._2().getLegalname().getValue() : "",
|
Optional.ofNullable(r._1()._2().getLegalname()).map(Field::getValue).orElse(""),
|
||||||
r._1()._2().getLegalshortname() != null ? r._1()._2().getLegalshortname().getValue() : "",
|
Optional.ofNullable(r._1()._2().getLegalshortname()).map(Field::getValue).orElse(""),
|
||||||
r._1()._2().getCountry() != null ? r._1()._2().getCountry().getClassid() : "",
|
Optional.ofNullable(r._1()._2().getCountry()).map(Qualifier::getClassid).orElse(""),
|
||||||
r._1()._2().getWebsiteurl() != null ? r._1()._2().getWebsiteurl().getValue() : "",
|
Optional.ofNullable(r._1()._2().getWebsiteurl()).map(Field::getValue).orElse(""),
|
||||||
r._1()._2().getCollectedfrom().get(0).getValue(),
|
Optional.ofNullable(r._1()._2().getCollectedfrom()).map(cf -> cf.get(0).getValue()).orElse(null),
|
||||||
"",
|
"",
|
||||||
structuredPropertyListToString(r._1()._2().getPid()),
|
structuredPropertyListToString(r._1()._2().getPid()),
|
||||||
parseECField(r._1()._2().getEclegalbody()),
|
parseECField(r._1()._2().getEclegalbody()),
|
||||||
|
|
|
@ -217,7 +217,7 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
|
||||||
final Organization o = r._2()._2();
|
final Organization o = r._2()._2();
|
||||||
return new OrgSimRel(
|
return new OrgSimRel(
|
||||||
r._1()._1(),
|
r._1()._1(),
|
||||||
o.getOriginalId().get(0),
|
Optional.ofNullable(o.getOriginalId()).map(oid -> oid.get(0)).orElse(null),
|
||||||
Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""),
|
Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""),
|
||||||
Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""),
|
Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""),
|
||||||
Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""),
|
Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""),
|
||||||
|
@ -249,7 +249,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
|
(MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
|
||||||
OrgSimRel orgSimRel = r._1()._2();
|
OrgSimRel orgSimRel = r._1()._2();
|
||||||
orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
|
orgSimRel
|
||||||
|
.setLocal_id(
|
||||||
|
Optional.ofNullable(r._2()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null));
|
||||||
return orgSimRel;
|
return orgSimRel;
|
||||||
},
|
},
|
||||||
Encoders.bean(OrgSimRel.class));
|
Encoders.bean(OrgSimRel.class));
|
||||||
|
|
|
@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.ReduceFunction;
|
import org.apache.spark.api.java.function.ReduceFunction;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
|
|
||||||
import org.apache.spark.sql.types.StructType;
|
import org.apache.spark.sql.types.StructType;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
|
import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
import eu.dnetlib.pace.util.SparkCompatUtils;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
import scala.Tuple3;
|
import scala.Tuple3;
|
||||||
|
|
||||||
|
@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
|
||||||
StructType idsSchema = StructType
|
StructType idsSchema = StructType
|
||||||
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
|
.fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");
|
||||||
|
|
||||||
Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
|
Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));
|
||||||
|
|
||||||
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
|
for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
|
||||||
String entityPath = graphBasePath + '/' + entityType.name();
|
String entityPath = graphBasePath + '/' + entityType.name();
|
||||||
|
|
|
@ -102,6 +102,8 @@
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=15000
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
|
--conf spark.network.timeout=300s
|
||||||
|
--conf spark.shuffle.registration.timeout=50000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--graphOutputPath</arg><arg>${graphOutputPath}</arg>
|
<arg>--graphOutputPath</arg><arg>${graphOutputPath}</arg>
|
||||||
|
|
|
@ -33,16 +33,14 @@
|
||||||
<description>max number of elements in a connected component</description>
|
<description>max number of elements in a connected component</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkResourceOpts</name>
|
||||||
<description>memory for driver process</description>
|
<value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
|
||||||
|
<description>spark resource options</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkExecutorMemory</name>
|
<name>sparkResourceOptsCreateMergeRel</name>
|
||||||
<description>memory for individual executor</description>
|
<value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
|
||||||
</property>
|
<description>spark resource options</description>
|
||||||
<property>
|
|
||||||
<name>sparkExecutorCores</name>
|
|
||||||
<description>number of cores used by single executor</description>
|
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>oozieActionShareLibForSpark2</name>
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
|
@ -119,9 +117,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -146,9 +142,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -174,9 +168,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOptsCreateMergeRel}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -203,9 +195,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -230,9 +220,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -257,9 +245,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -283,9 +269,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -309,9 +293,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
|
|
@ -123,7 +123,7 @@ class EntityMergerTest implements Serializable {
|
||||||
assertEquals(dataInfo, pub_merged.getDataInfo());
|
assertEquals(dataInfo, pub_merged.getDataInfo());
|
||||||
|
|
||||||
// verify datepicker
|
// verify datepicker
|
||||||
assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());
|
assertEquals("2016-01-01", pub_merged.getDateofacceptance().getValue());
|
||||||
|
|
||||||
// verify authors
|
// verify authors
|
||||||
assertEquals(13, pub_merged.getAuthor().size());
|
assertEquals(13, pub_merged.getAuthor().size());
|
||||||
|
|
|
@ -78,7 +78,7 @@ public class IdGeneratorTest {
|
||||||
System.out.println("winner 3 = " + id2);
|
System.out.println("winner 3 = " + id2);
|
||||||
|
|
||||||
assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
|
assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
|
||||||
assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2);
|
assertEquals("50|dedup_wf_002::345e5d1b80537b0d0e0a49241ae9e516", id2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -258,7 +258,6 @@ public class SparkDedupTest implements Serializable {
|
||||||
assertEquals(115, sw_simrel.count());
|
assertEquals(115, sw_simrel.count());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// check if the first relation in the whitelist exists
|
// check if the first relation in the whitelist exists
|
||||||
assertTrue(
|
assertTrue(
|
||||||
sw_simrel
|
sw_simrel
|
||||||
|
|
|
@ -143,7 +143,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(145, orgs_simrel);
|
assertEquals(86, orgs_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -172,7 +172,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(181, orgs_simrel);
|
assertEquals(122, orgs_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -196,7 +196,9 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
"-la",
|
"-la",
|
||||||
"lookupurl",
|
"lookupurl",
|
||||||
"-w",
|
"-w",
|
||||||
testOutputBasePath
|
testOutputBasePath,
|
||||||
|
"-h",
|
||||||
|
""
|
||||||
});
|
});
|
||||||
|
|
||||||
new SparkCreateMergeRels(parser, spark).run(isLookUpService);
|
new SparkCreateMergeRels(parser, spark).run(isLookUpService);
|
||||||
|
|
|
@ -13,14 +13,16 @@ import java.io.Serializable;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.*;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.cli.ParseException;
|
import org.apache.commons.cli.ParseException;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -129,7 +131,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication"))
|
.load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(37, pubs_simrel);
|
assertEquals(9, pubs_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -142,7 +144,8 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
"--actionSetId", testActionSetId,
|
"--actionSetId", testActionSetId,
|
||||||
"--isLookUpUrl", "lookupurl",
|
"--isLookUpUrl", "lookupurl",
|
||||||
"--workingPath", workingPath,
|
"--workingPath", workingPath,
|
||||||
"--cutConnectedComponent", "3"
|
"--cutConnectedComponent", "3",
|
||||||
|
"-h", ""
|
||||||
}), spark)
|
}), spark)
|
||||||
.run(isLookUpService);
|
.run(isLookUpService);
|
||||||
|
|
||||||
|
@ -171,7 +174,8 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
"--graphBasePath", graphInputPath,
|
"--graphBasePath", graphInputPath,
|
||||||
"--actionSetId", testActionSetId,
|
"--actionSetId", testActionSetId,
|
||||||
"--isLookUpUrl", "lookupurl",
|
"--isLookUpUrl", "lookupurl",
|
||||||
"--workingPath", workingPath
|
"--workingPath", workingPath,
|
||||||
|
"-h", ""
|
||||||
}), spark)
|
}), spark)
|
||||||
.run(isLookUpService);
|
.run(isLookUpService);
|
||||||
|
|
||||||
|
@ -207,7 +211,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
assertTrue(dups.contains(r.getSource()));
|
assertTrue(dups.contains(r.getSource()));
|
||||||
});
|
});
|
||||||
|
|
||||||
assertEquals(32, merges.count());
|
assertEquals(26, merges.count());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -228,7 +232,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
|
.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
|
||||||
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
||||||
|
|
||||||
assertEquals(3, roots.count());
|
assertEquals(4, roots.count());
|
||||||
|
|
||||||
final Dataset<Publication> pubs = spark
|
final Dataset<Publication> pubs = spark
|
||||||
.read()
|
.read()
|
||||||
|
@ -369,7 +373,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
.distinct()
|
.distinct()
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(19, publications); // 16 originals + 3 roots
|
assertEquals(20, publications); // 16 originals + 4 roots
|
||||||
|
|
||||||
long deletedPubs = spark
|
long deletedPubs = spark
|
||||||
.read()
|
.read()
|
||||||
|
@ -380,7 +384,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
.distinct()
|
.distinct()
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(mergedPubs, deletedPubs);
|
// assertEquals(mergedPubs, deletedPubs);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String classPathResourceAsString(String path) throws IOException {
|
private static String classPathResourceAsString(String path) throws IOException {
|
||||||
|
|
|
@ -169,10 +169,10 @@ public class SparkStatsTest implements Serializable {
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(414, orgs_blocks);
|
assertEquals(414, orgs_blocks);
|
||||||
assertEquals(187, pubs_blocks);
|
assertEquals(221, pubs_blocks);
|
||||||
assertEquals(128, sw_blocks);
|
assertEquals(134, sw_blocks);
|
||||||
assertEquals(192, ds_blocks);
|
assertEquals(196, ds_blocks);
|
||||||
assertEquals(194, orp_blocks);
|
assertEquals(198, orp_blocks);
|
||||||
}
|
}
|
||||||
|
|
||||||
@AfterAll
|
@AfterAll
|
||||||
|
|
|
@ -30,7 +30,6 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||||
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
import eu.dnetlib.doiboost.orcid.util.DownloadsReport;
|
import eu.dnetlib.doiboost.orcid.util.DownloadsReport;
|
||||||
import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector;
|
import eu.dnetlib.doiboost.orcid.util.MultiAttemptsHttpConnector;
|
||||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
|
||||||
|
|
||||||
public class OrcidClientTest {
|
public class OrcidClientTest {
|
||||||
final int REQ_LIMIT = 24;
|
final int REQ_LIMIT = 24;
|
||||||
|
@ -48,7 +47,7 @@ public class OrcidClientTest {
|
||||||
private static Path testPath;
|
private static Path testPath;
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
private static void setUp() throws IOException {
|
public static void setUp() throws IOException {
|
||||||
testPath = Files.createTempDirectory(OrcidClientTest.class.getName());
|
testPath = Files.createTempDirectory(OrcidClientTest.class.getName());
|
||||||
System.out.println("using test path: " + testPath);
|
System.out.println("using test path: " + testPath);
|
||||||
}
|
}
|
||||||
|
@ -151,9 +150,9 @@ public class OrcidClientTest {
|
||||||
System.out.println(valueDt.toString());
|
System.out.println(valueDt.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
// @Test
|
@Test
|
||||||
@Ignore
|
@Disabled
|
||||||
private void testModifiedDate() throws ParseException {
|
public void testModifiedDate() throws ParseException {
|
||||||
testDate(toRetrieveDate);
|
testDate(toRetrieveDate);
|
||||||
testDate(toNotRetrieveDate);
|
testDate(toNotRetrieveDate);
|
||||||
testDate(shortDate);
|
testDate(shortDate);
|
||||||
|
@ -226,7 +225,7 @@ public class OrcidClientTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Disabled
|
@Disabled
|
||||||
private void slowedDownDownloadTest() throws Exception {
|
public void slowedDownDownloadTest() throws Exception {
|
||||||
String orcid = "0000-0001-5496-1243";
|
String orcid = "0000-0001-5496-1243";
|
||||||
String record = slowedDownDownload(orcid);
|
String record = slowedDownDownload(orcid);
|
||||||
String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
|
String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
|
||||||
|
@ -332,7 +331,7 @@ public class OrcidClientTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
@Disabled
|
||||||
void testUpdatedRecord() throws Exception {
|
void testUpdatedRecord() throws Exception {
|
||||||
final String base64CompressedRecord = IOUtils
|
final String base64CompressedRecord = IOUtils
|
||||||
.toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
|
.toString(getClass().getResourceAsStream("0000-0001-7281-6306.compressed.base64"));
|
||||||
|
@ -341,7 +340,7 @@ public class OrcidClientTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
@Disabled
|
||||||
void testUpdatedWork() throws Exception {
|
void testUpdatedWork() throws Exception {
|
||||||
final String base64CompressedWork = "H4sIAAAAAAAAAM1XS2/jNhC+51cQOuxJsiXZSR03Vmq0G6Bo013E6R56oyXaZiOJWpKy4y783zvUg5Ksh5uiCJogisX5Zjj85sHx3f1rFKI94YKyeGE4I9tAJPZZQOPtwvj9+cGaGUhIHAc4ZDFZGEcijHvv6u7A+MtcPVCSSgsUQObYzuzaccBEguVuYYxt+LHgbwKP6a11M3WnY6UzrpB7KuiahlQeF0aSrkPqGwhcisWcxpLwGIcLYydlMh+PD4fDiHGfBvDcjmMxLhGlBglSH8vsIH0qGlLqBFRIGvvDWjWQ1iMJJ2CKBANqGlNqMbkj3IpxRPq1KkypFZFoDRHa0aRfq8JoNjhnfIAJJS6xPouiIQJyeYmGQzE+cO5cXqITcItBlKyASExD0a93jiwtvJDjYXDDAqBPHoH2wMmVWGNf8xyyaEBiSTeUDHHWBpd2Nmmc10yfbgHQrHCyIRxKjQwRUoFKPRwEnIgBnQJQVdGeQgJaCRN0OMnPkaUFVbD9WkpaIndQJowf+8EFoIpTErJjBFQOBavElFpfUxwC9ZcqvQErdQXhe+oPFF8BaObupYzVsYEOARzSoZBWmKqaBMHcV0Wf8oG0beIqD+Gdkz0lhyE3NajUW6fhQFSV9Nw/MCBYyofYa0EN7wrBz13eP+Y+J6obWgE8Pdd2JpYD94P77Ezmjj13b0bu5PqPu3EXumEnxEJaEVxSUIHammsra+53z44zt2/m1/bItaeVtQ6dhs3c4XytvW75IYUchMKvEHVUyqmnWBFAS0VJrqSvQde6vp251ux2NtFuKcVOi+oK9YY0M0Cn6o4J6WkvtEK2XJ1vfPGAZxSoK8lb+SxJBbLQx1CohOLndjJUywQWUFmqEi3G6Zaqf/7buOyYJd5IYpfmf0XipfP18pDR9cQCeEuJQI/Lx36bFbVnpBeL2UwmqQw7ApAvf4GeGGQdEbENgolui/wdpjHaYCmPCIPPAmGBIsxfoLUhyRCB0SeCakEBJRKBtfJ+UBbI15TG4PaGBAhWthx8DmFYtHZQujv1CWbLLdzmmUKmHEOWCe1/zdu78bn/+YH+hCOqOzcXfFwuP6OVT/P710crwqGXFrpNaM2GT3MXarw01i15TIi3pmtJXgtbTVGf3h6HKfF+wBAnPyTfdCChudlm5gZaoG//F9pPZsGQcqqbyZN5hBau5OoIJ3PPwjTKDuG4s5MZp2rMzF5PZoK34IT6PIFOPrk+mTiVO5aJH2C+JJRjE/06eoRfpJxa4VgyYaLlaJUv/EhCfATMU/76gEOfmehL/qbJNNHjaFna+CQYB8wvo9PpPFJ5MOrJ1Ix7USBZqBl7KRNOx1d3jex7SG6zuijqCMWRusBsncjZSrM2u82UJmqzpGhvUJN2t6caIM9QQgO9c0t40UROnWsJd2Rbs+nsxpna9u30ttNkjechmzHjEST+X5CkkuNY0GzQkzyFseAf7lSZuLwdh1xSXKvvQJ4g4abTYgPV7uMt3rskohlJmMa82kQkshtyBEIYqQ+YB8X3oRHg7iFKi/bZP+Ao+T6BJhIT/vNPi8ffZs+flk+r2v0WNroZiyWn6xRmadHqTJXsjLJczElAZX6TnJdoWTM1SI2gfutv3rjeBt5t06rVvNuWup29246tlvluO+u2/G92bK9DXheL6uFd/Q3EaRDZqBIAAA==";
|
final String base64CompressedWork = "H4sIAAAAAAAAAM1XS2/jNhC+51cQOuxJsiXZSR03Vmq0G6Bo013E6R56oyXaZiOJWpKy4y783zvUg5Ksh5uiCJogisX5Zjj85sHx3f1rFKI94YKyeGE4I9tAJPZZQOPtwvj9+cGaGUhIHAc4ZDFZGEcijHvv6u7A+MtcPVCSSgsUQObYzuzaccBEguVuYYxt+LHgbwKP6a11M3WnY6UzrpB7KuiahlQeF0aSrkPqGwhcisWcxpLwGIcLYydlMh+PD4fDiHGfBvDcjmMxLhGlBglSH8vsIH0qGlLqBFRIGvvDWjWQ1iMJJ2CKBANqGlNqMbkj3IpxRPq1KkypFZFoDRHa0aRfq8JoNjhnfIAJJS6xPouiIQJyeYmGQzE+cO5cXqITcItBlKyASExD0a93jiwtvJDjYXDDAqBPHoH2wMmVWGNf8xyyaEBiSTeUDHHWBpd2Nmmc10yfbgHQrHCyIRxKjQwRUoFKPRwEnIgBnQJQVdGeQgJaCRN0OMnPkaUFVbD9WkpaIndQJowf+8EFoIpTErJjBFQOBavElFpfUxwC9ZcqvQErdQXhe+oPFF8BaObupYzVsYEOARzSoZBWmKqaBMHcV0Wf8oG0beIqD+Gdkz0lhyE3NajUW6fhQFSV9Nw/MCBYyofYa0EN7wrBz13eP+Y+J6obWgE8Pdd2JpYD94P77Ezmjj13b0bu5PqPu3EXumEnxEJaEVxSUIHammsra+53z44zt2/m1/bItaeVtQ6dhs3c4XytvW75IYUchMKvEHVUyqmnWBFAS0VJrqSvQde6vp251ux2NtFuKcVOi+oK9YY0M0Cn6o4J6WkvtEK2XJ1vfPGAZxSoK8lb+SxJBbLQx1CohOLndjJUywQWUFmqEi3G6Zaqf/7buOyYJd5IYpfmf0XipfP18pDR9cQCeEuJQI/Lx36bFbVnpBeL2UwmqQw7ApAvf4GeGGQdEbENgolui/wdpjHaYCmPCIPPAmGBIsxfoLUhyRCB0SeCakEBJRKBtfJ+UBbI15TG4PaGBAhWthx8DmFYtHZQujv1CWbLLdzmmUKmHEOWCe1/zdu78bn/+YH+hCOqOzcXfFwuP6OVT/P710crwqGXFrpNaM2GT3MXarw01i15TIi3pmtJXgtbTVGf3h6HKfF+wBAnPyTfdCChudlm5gZaoG//F9pPZsGQcqqbyZN5hBau5OoIJ3PPwjTKDuG4s5MZp2rMzF5PZoK34IT6PIFOPrk+mTiVO5aJH2C+JJRjE/06eoRfpJxa4VgyYaLlaJUv/EhCfATMU/76gEOfmehL/qbJNNHjaFna+CQYB8wvo9PpPFJ5MOrJ1Ix7USBZqBl7KRNOx1d3jex7SG6zuijqCMWRusBsncjZSrM2u82UJmqzpGhvUJN2t6caIM9QQgO9c0t40UROnWsJd2Rbs+nsxpna9u30ttNkjechmzHjEST+X5CkkuNY0GzQkzyFseAf7lSZuLwdh1xSXKvvQJ4g4abTYgPV7uMt3rskohlJmMa82kQkshtyBEIYqQ+YB8X3oRHg7iFKi/bZP+Ao+T6BJhIT/vNPi8ffZs+flk+r2v0WNroZiyWn6xRmadHqTJXsjLJczElAZX6TnJdoWTM1SI2gfutv3rjeBt5t06rVvNuWup29246tlvluO+u2/G92bK9DXheL6uFd/Q3EaRDZqBIAAA==";
|
||||||
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
|
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
|
||||||
|
@ -413,7 +412,7 @@ public class OrcidClientTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
@Disabled
|
||||||
void testDownloadedAuthor() throws Exception {
|
void testDownloadedAuthor() throws Exception {
|
||||||
final String base64CompressedWork = "H4sIAAAAAAAAAI2Yy26jMBSG932KiD0hIe1MiwiVZjGLkWbX2XRHsFOsgs3YJmnefszFFy4+mUhtVPz9P/gcH/vQ9PWrrjYXzAVh9Bjst7tgg2nBEKEfx+DP28/wOdgImVOUV4ziY3DDInjNHlKOC8ZRMnxtmlyWxyDaqU+ofg7h/uX7IYwfn+Ngo25ARUKoxJzm1TEopWySKLper1vGC4LU74+IikgTWoFRW+SyfyyfxCBag4iQhBawyoGMDjdqJrnECJAZRquYLDEPaV5jv8oyWlXj+qTiXZLGr7KMiQbnjAOR6IY1W7C6hgIwjGt6SKGfHsY13ajHYipLIcIyJ5Xw6+akdvjEtyt4wxEwM6+VGph5N2zYr2ENhQRhKsmZYChmS1j7nFs6VIBPOwImKhyfMVeFg6GAWEjrcoQ4FoBmBGwVXYhagGHDBIEX+ZzUDiqyn35VN6rJUpUJ4zc/PAI2T03FbrUKJZQszWjV3zavVOjvVfoE01qB+YUUQPGNwHTt3luxJjdqh1AxJFBKLWOrSeCcF13RtxxYtlPOPqH6m+MLwVfoMQ2kdae2ArLajc6fTxkI1nIoegs0yB426pMO+0fSw07xDKMu0XKSde5C2VvrlVMijRzFwqY7XTJI1QMLWcmEzMxtDdxfHiYSgTNJnYJ1K9y5k0tUrMgrnGGaRiuXxxuClulYUbr0nBvpkYLjvgTCGsuSoex3f1CEvRPHKI184NJKtKeaiO7cD5E61bJ4F+9DFd7d01u8Tw6H5BBvvz8f3q3nXLGIeJULGdaqeVBBRK7rS7h/fNvvk/gpedxt4923dxP7Fc3KtKuc1BhlkrfYmeN4dcmrhmbw60+HmWw2CKgbTuqc32CXKTTmeTWT6bDBjPsQ0DTpnchdaYO0ayQ2FyLIiVREqs25aU8VKYLRbK0BsyZuqvr1MU2Sm/rDdhe/2CRN6FU/b+oBVyj1zqRtC5F8kAumfTclsl+s7EoNQu64nfOaVLeezX60Z3XCULLi6GI2IZGTEeey7fec9lBAuXawIHKcpifE7GABHWfoxLVfpUNPBXoMbZWrHFsR3bPAk9J9i2sw9nW6AQT1mpk++7JhW+v44Hmt8PomJqfD13jRnvFOSxCKtu6qHoyBbQ7cMFo750UEfGaXm6bEeplXIXj2hvL6mA7tzvIwmM9pbJFBG834POZdLGi2gH2u9u0K9HMwn5PTioFWLufzmrS4oNuU9Pkt2rf/2jMs7fMdm2rQTTM+j+49AzToAVuXYA1mD2k0+XdE9vAP+JYR5NcQAAA=";
|
final String base64CompressedWork = "H4sIAAAAAAAAAI2Yy26jMBSG932KiD0hIe1MiwiVZjGLkWbX2XRHsFOsgs3YJmnefszFFy4+mUhtVPz9P/gcH/vQ9PWrrjYXzAVh9Bjst7tgg2nBEKEfx+DP28/wOdgImVOUV4ziY3DDInjNHlKOC8ZRMnxtmlyWxyDaqU+ofg7h/uX7IYwfn+Ngo25ARUKoxJzm1TEopWySKLper1vGC4LU74+IikgTWoFRW+SyfyyfxCBag4iQhBawyoGMDjdqJrnECJAZRquYLDEPaV5jv8oyWlXj+qTiXZLGr7KMiQbnjAOR6IY1W7C6hgIwjGt6SKGfHsY13ajHYipLIcIyJ5Xw6+akdvjEtyt4wxEwM6+VGph5N2zYr2ENhQRhKsmZYChmS1j7nFs6VIBPOwImKhyfMVeFg6GAWEjrcoQ4FoBmBGwVXYhagGHDBIEX+ZzUDiqyn35VN6rJUpUJ4zc/PAI2T03FbrUKJZQszWjV3zavVOjvVfoE01qB+YUUQPGNwHTt3luxJjdqh1AxJFBKLWOrSeCcF13RtxxYtlPOPqH6m+MLwVfoMQ2kdae2ArLajc6fTxkI1nIoegs0yB426pMO+0fSw07xDKMu0XKSde5C2VvrlVMijRzFwqY7XTJI1QMLWcmEzMxtDdxfHiYSgTNJnYJ1K9y5k0tUrMgrnGGaRiuXxxuClulYUbr0nBvpkYLjvgTCGsuSoex3f1CEvRPHKI184NJKtKeaiO7cD5E61bJ4F+9DFd7d01u8Tw6H5BBvvz8f3q3nXLGIeJULGdaqeVBBRK7rS7h/fNvvk/gpedxt4923dxP7Fc3KtKuc1BhlkrfYmeN4dcmrhmbw60+HmWw2CKgbTuqc32CXKTTmeTWT6bDBjPsQ0DTpnchdaYO0ayQ2FyLIiVREqs25aU8VKYLRbK0BsyZuqvr1MU2Sm/rDdhe/2CRN6FU/b+oBVyj1zqRtC5F8kAumfTclsl+s7EoNQu64nfOaVLeezX60Z3XCULLi6GI2IZGTEeey7fec9lBAuXawIHKcpifE7GABHWfoxLVfpUNPBXoMbZWrHFsR3bPAk9J9i2sw9nW6AQT1mpk++7JhW+v44Hmt8PomJqfD13jRnvFOSxCKtu6qHoyBbQ7cMFo750UEfGaXm6bEeplXIXj2hvL6mA7tzvIwmM9pbJFBG834POZdLGi2gH2u9u0K9HMwn5PTioFWLufzmrS4oNuU9Pkt2rf/2jMs7fMdm2rQTTM+j+49AzToAVuXYA1mD2k0+XdE9vAP+JYR5NcQAAA=";
|
||||||
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
|
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
|
||||||
|
@ -421,7 +420,7 @@ public class OrcidClientTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
@Disabled
|
||||||
void testDownloadedWork() throws Exception {
|
void testDownloadedWork() throws Exception {
|
||||||
final String base64CompressedWork = "H4sIAAAAAAAAANVa63LiOBb+z1Oo+LVbhbkGAlTCLE1Id9IhTQV6unr/CVvB2tiWR5Khmal5rX2BfbE9ki3b3Jzt6Y13h6pQSPrOXTo6knL10zffQxvCBWXBdbVVb1YRCWzm0GB9Xf28vLX6VSQkDhzssYBcV3dEVH8aVa62jL8M1RcKI2kBAYwNLnrtXrMPFCGW7nW10YSPBX8dq3XRb1swNGgomkaG3FBBV9SjcnddDaOVR+0qApUCMaSBJDzA3nXVlTIcNhrb7bbOuE0d+F43AtEwCENBnMjGUhtyjiSFGBqHCkkDu5gqB0rpSMgJsCJOAVmKMVRMuoRbAfbJeaoMY6h84q8gQi4Nz1NlmNQbnDNe4Ak1bLA28/0iB8TjBg1GMV5gdzxu0CGoxSBKlkMkpp44T3eINBxeyG5bKDABpJb7QF1guRpOsd/iOWRRhwSSPlNS5LNjsOHzHAXxmjlHmwBSr3DyTDgsNVLkkAxk6LDjcCIKaBJAtoo2FCagFTJBiyf5IdJwUAv2PJUaNUgXlgnju/PgBJDFKfTYzgdXFgXLYAzVLxH2wPWvrfQ9mKEVhG+oXbD4EsD+3H1txqaxgQwBPqRFIc0w2WoSBHNbLfqIF0zbfVymIbQ52VCyLVIzBRm6VeQVRFWNHuoHDASLeJH3jqDVUQXB5yrOH0ObE5UNLQe+R+1mu2U1u1Z7sGy2hq3esN2tt5oXf79qnELv8fGwkJYPmxSswD1uA6vVXrY7w+5g2G3WuxedjNsJmj2escJx33G/ZXsU5iAs/AyRR0WcjpRXBLglc0lM1BjP59bX1qw9Hn/+dH87/dy9vBikeinKkyzVHjoqJNWIk7QuE3KU6pES6O7MwsarJh44QW1KowcWOCxAC9tlzEPsGX3YrYGQICgS0JKzENach2bEoTYNyKEQzaJyQnzSqesKSaV3IhRx92L8tLAm7GerjbZUujSwlFnIobqKkTuth+Q4ED4Vqqypp5JyfK8ah5Ji0f8AZVSGT2TZVGXfBLw/liOyqdRpJqfyXr8ldyEZrehKkm8Jr/2hc3Qb7EVk9DfMJbU98pu3k+6aETXXBebCZpt23tBaBUfSZRxdo98eYmgNfRxrh3zAnldDM/37FvZ+IiWtoQfddgiaEGBIDGCG7btA7jgBP9svAK2h90l4yYqIGop5jgMHXA4J0NB9ksR+YTX0qFtfqACO01jGjDHFPx552AW2W0P3uvGROk4NLfTvCeNS8X9MaDg1rL9Qz6PYh7En3f4ZNmKS6nUfQYFmE6PYe05IYBqPFGaq5wHlYpaoDbYqxokVK+JBerz51z+BIzc+SfSdTHVrTiSYtZzGFNOdGrr5ohsLF2+NUguqppkDoua6/S6yXwAYu44pM+/HiZ1BwEDWMqYbC5fjZ+MEBwMjb4PRLdTFYWrUwiUhJH/H+G3pMl/7fjqJhTGwSwU5lnfLsVDmxIPvmRetbJeCOsvfaxWXbXWxLVziqNky51BLW1OP2JKzgNoASSa7Gk1WAfrLI9mirzBBIUD1r/W/AgrMla7CjEMOzYBJolo30/mnxd0SzadPt5+eZtMb9O7rEN1wNINgEA8Ha+IxNMdrHLCQRR4TFRCudnmB7m6GqD0YDCqW+lQqlfnndw93iw/TJ/RwN5k+TqZDNJkAQyUvUlWvktjrdgbQEeI1EapN8Grd7MOeYJlfajSxWVOMfcIhVQXgfcFsqhcceobVA/U3GjsbDCYrjVSKSz0wHo8Xym6dArRvvjsbAfUGouFr8s5lG9o72DVVSy1saDqMqlarWW+12r2GiIXXMzuAU6AQcLLqWf3mZRf6iOlsNQdda9BudhQnvNNdPWN8XA7BgU5G2k3pLADA75XD3BSnn3y+3M90SbZWGczkxiRVmfSaJrd0V8u0yG3CeYRyht7O07Ste45weuqNmhcpLO44woEPRq1eilLN/f3ntEqGPFfzi2P
mudHTO3EOEKf60LdTyUeDr7KIIzKfTfqtdr896JxklQtbES/IQD7UyL+SZIJSXYhLHkHZ9oqEjPR1MRzWu550cDYdCeI9n+S4hzouUU76+UeCQJ0fjkKn0+v3m703i0Eh/z97BCDH/XAAziTIt4rH94j7s4dHbSY/HJ90e3qriBQL+MMxCGETs9j/QxiSQ5PaS63/QsZqdS8vOxdvtj7Oc//fL4dTI2LvDAfVA6erSDKe3+cPxw70j4c5HHZlfLT9iAEZYKjZkxOYKZxymJy659l/t+QZllC5bvVJrzShD5GN0/NkiaZyqNcJh0NrdngtTfp7wviaHB+SS1Ng7O+Sk3h5HodT4S8RyY78pUmGM6eEg1l8tVCa1KnvY/SgrzDKsxRLF46j+uahNKH3BE6lsIb1lUxpUhdS3WUE+u6nPP/qiyAsklumMhMz9SBNqeus0oQ+QXqwIa7m3qy87IhXnBLPI8kVXXlZMaASm5vAEqWuKYkvHMtbPdiPiIdm6dVmeVMZjX+lfnKDWmaRAT7ev6ctTfhEF3RoWnJeXlKfSXcHcsf69rk0wTd4Qx30RV9yl5et2Ipwqe/SS5MJXiU8vbIv2b/qZaC8PZ65AUwj9QJR3vx1mQ9b7VPy1FFebnSpWq7xi0qJuwA+fLYpL7rwJdLXobcSa97kM4Cl35f3YXmofp0+8R9gBc/XeXL9Vn38pH7mLTs27z9T8ky1n7ynlZ0I4le78rYzl6t/woG5krwQlpcRcLDD2UPkH5F73C9G5tFKfY0q/wa1TIHI0CgAAA==";
|
final String base64CompressedWork = "H4sIAAAAAAAAANVa63LiOBb+z1Oo+LVbhbkGAlTCLE1Id9IhTQV6unr/CVvB2tiWR5Khmal5rX2BfbE9ki3b3Jzt6Y13h6pQSPrOXTo6knL10zffQxvCBWXBdbVVb1YRCWzm0GB9Xf28vLX6VSQkDhzssYBcV3dEVH8aVa62jL8M1RcKI2kBAYwNLnrtXrMPFCGW7nW10YSPBX8dq3XRb1swNGgomkaG3FBBV9SjcnddDaOVR+0qApUCMaSBJDzA3nXVlTIcNhrb7bbOuE0d+F43AtEwCENBnMjGUhtyjiSFGBqHCkkDu5gqB0rpSMgJsCJOAVmKMVRMuoRbAfbJeaoMY6h84q8gQi4Nz1NlmNQbnDNe4Ak1bLA28/0iB8TjBg1GMV5gdzxu0CGoxSBKlkMkpp44T3eINBxeyG5bKDABpJb7QF1guRpOsd/iOWRRhwSSPlNS5LNjsOHzHAXxmjlHmwBSr3DyTDgsNVLkkAxk6LDjcCIKaBJAtoo2FCagFTJBiyf5IdJwUAv2PJUaNUgXlgnju/PgBJDFKfTYzgdXFgXLYAzVLxH2wPWvrfQ9mKEVhG+oXbD4EsD+3H1txqaxgQwBPqRFIc0w2WoSBHNbLfqIF0zbfVymIbQ52VCyLVIzBRm6VeQVRFWNHuoHDASLeJH3jqDVUQXB5yrOH0ObE5UNLQe+R+1mu2U1u1Z7sGy2hq3esN2tt5oXf79qnELv8fGwkJYPmxSswD1uA6vVXrY7w+5g2G3WuxedjNsJmj2escJx33G/ZXsU5iAs/AyRR0WcjpRXBLglc0lM1BjP59bX1qw9Hn/+dH87/dy9vBikeinKkyzVHjoqJNWIk7QuE3KU6pES6O7MwsarJh44QW1KowcWOCxAC9tlzEPsGX3YrYGQICgS0JKzENach2bEoTYNyKEQzaJyQnzSqesKSaV3IhRx92L8tLAm7GerjbZUujSwlFnIobqKkTuth+Q4ED4Vqqypp5JyfK8ah5Ji0f8AZVSGT2TZVGXfBLw/liOyqdRpJqfyXr8ldyEZrehKkm8Jr/2hc3Qb7EVk9DfMJbU98pu3k+6aETXXBebCZpt23tBaBUfSZRxdo98eYmgNfRxrh3zAnldDM/37FvZ+IiWtoQfddgiaEGBIDGCG7btA7jgBP9svAK2h90l4yYqIGop5jgMHXA4J0NB9ksR+YTX0qFtfqACO01jGjDHFPx552AW2W0P3uvGROk4NLfTvCeNS8X9MaDg1rL9Qz6PYh7En3f4ZNmKS6nUfQYFmE6PYe05IYBqPFGaq5wHlYpaoDbYqxokVK+JBerz51z+BIzc+SfSdTHVrTiSYtZzGFNOdGrr5ohsLF2+NUguqppkDoua6/S6yXwAYu44pM+/HiZ1BwEDWMqYbC5fjZ+MEBwMjb4PRLdTFYWrUwiUhJH/H+G3pMl/7fjqJhTGwSwU5lnfLsVDmxIPvmRetbJeCOsvfaxWXbXWxLVziqNky51BLW1OP2JKzgNoASSa7Gk1WAfrLI9mirzBBIUD1r/W/AgrMla7CjEMOzYBJolo30/mnxd0SzadPt5+eZtMb9O7rEN1wNINgEA8Ha+IxNMdrHLCQRR4TFRCudnmB7m6GqD0YDCqW+lQqlfnndw93iw/TJ/RwN5k+TqZDNJkAQyUvUlWvktjrdgbQEeI1EapN8Grd7MOeYJlfajSxWVOMfcIhVQXgfcFsqhcceobVA/U3GjsbDCYrjVSKSz0wHo8Xym6dArRvvjsbAfUGouFr8s5lG9o72DVVSy1saDqMqlarWW+12r2GiIXXMzuAU6AQcLLqWf3mZRf6iOlsNQdda9BudhQnvNNdPWN8XA7BgU5G2k3pLADA75XD3BSnn3y+3M90SbZWGczkxiRVmfSaJrd0V8u0yG3CeYRyht7O07Ste45weuqNmhcpLO44woEPRq1eilLN/f3ntEqGPFfzi2P
mudHTO3EOEKf60LdTyUeDr7KIIzKfTfqtdr896JxklQtbES/IQD7UyL+SZIJSXYhLHkHZ9oqEjPR1MRzWu550cDYdCeI9n+S4hzouUU76+UeCQJ0fjkKn0+v3m703i0Eh/z97BCDH/XAAziTIt4rH94j7s4dHbSY/HJ90e3qriBQL+MMxCGETs9j/QxiSQ5PaS63/QsZqdS8vOxdvtj7Oc//fL4dTI2LvDAfVA6erSDKe3+cPxw70j4c5HHZlfLT9iAEZYKjZkxOYKZxymJy659l/t+QZllC5bvVJrzShD5GN0/NkiaZyqNcJh0NrdngtTfp7wviaHB+SS1Ng7O+Sk3h5HodT4S8RyY78pUmGM6eEg1l8tVCa1KnvY/SgrzDKsxRLF46j+uahNKH3BE6lsIb1lUxpUhdS3WUE+u6nPP/qiyAsklumMhMz9SBNqeus0oQ+QXqwIa7m3qy87IhXnBLPI8kVXXlZMaASm5vAEqWuKYkvHMtbPdiPiIdm6dVmeVMZjX+lfnKDWmaRAT7ev6ctTfhEF3RoWnJeXlKfSXcHcsf69rk0wTd4Qx30RV9yl5et2Ipwqe/SS5MJXiU8vbIv2b/qZaC8PZ65AUwj9QJR3vx1mQ9b7VPy1FFebnSpWq7xi0qJuwA+fLYpL7rwJdLXobcSa97kM4Cl35f3YXmofp0+8R9gBc/XeXL9Vn38pH7mLTs27z9T8ky1n7ynlZ0I4le78rYzl6t/woG5krwQlpcRcLDD2UPkH5F73C9G5tFKfY0q/wa1TIHI0CgAAA==";
|
||||||
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
|
final String work = ArgumentApplicationParser.decompressValue(base64CompressedWork);
|
||||||
|
|
|
@ -4,21 +4,12 @@ package eu.dnetlib.dhp.bulktag;
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
|
import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.BufferedOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.zip.GZIPOutputStream;
|
|
||||||
|
|
||||||
import org.apache.avro.TestAnnotation;
|
|
||||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
|
||||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
|
||||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FSDataInputStream;
|
|
||||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -34,7 +25,6 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.sun.media.sound.ModelInstrumentComparator;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.api.Utils;
|
import eu.dnetlib.dhp.api.Utils;
|
||||||
import eu.dnetlib.dhp.api.model.CommunityEntityMap;
|
import eu.dnetlib.dhp.api.model.CommunityEntityMap;
|
||||||
|
|
|
@ -161,7 +161,7 @@ public class SparkResultToCommunityFromProject implements Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
res.setContext(propagatedContexts);
|
res.setContext(propagatedContexts);
|
||||||
return MergeUtils.checkedMerge(ret, res);
|
return MergeUtils.checkedMerge(ret, res, true);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
|
@ -100,16 +100,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
|
||||||
--conf spark.speculation=false
|
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
@ -132,12 +128,11 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
@ -160,12 +155,11 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
@ -188,12 +182,11 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
@ -218,12 +211,11 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
|
<arg>--sourcePath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
||||||
|
@ -247,19 +239,14 @@
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
||||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=4
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=4G
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.executor.memoryOverhead=5G
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.speculation=false
|
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
|
||||||
--conf spark.sql.shuffle.partitions=15000
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
||||||
|
@ -282,15 +269,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.speculation=false
|
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
|
@ -312,15 +296,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.speculation=false
|
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
|
@ -342,15 +323,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.sql.shuffle.partitions=4000
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.speculation=false
|
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
|
@ -362,15 +340,6 @@
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<join name="wait2" to="End"/>
|
<join name="wait2" to="End"/>
|
||||||
|
|
||||||
<!-- <action name="reset_workingDir">-->
|
|
||||||
<!-- <fs>-->
|
|
||||||
<!-- <delete path="${workingDir}"/>-->
|
|
||||||
<!-- <mkdir path="${workingDir}"/>-->
|
|
||||||
<!-- </fs>-->
|
|
||||||
<!-- <ok to="End"/>-->
|
|
||||||
<!-- <error to="Kill"/>-->
|
|
||||||
<!-- </action>-->
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
|
|
|
@ -90,6 +90,12 @@
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-pace-core</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
<artifactId>json-path</artifactId>
|
<artifactId>json-path</artifactId>
|
||||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.concurrent.atomic.AtomicReference;
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
import org.apache.commons.lang3.SerializationUtils;
|
import org.apache.commons.lang3.SerializationUtils;
|
||||||
|
@ -29,7 +30,10 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
|
||||||
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
|
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
|
||||||
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
|
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
|
||||||
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
|
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
|
||||||
mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
|
|
||||||
|
// commenting out the subject cleaning until we decide if we want to do it or not and the implementation will
|
||||||
|
// be completed. At the moment it is not capable of expanding the whole hierarchy.
|
||||||
|
// mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
|
||||||
return mapping;
|
return mapping;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,6 +42,13 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
|
||||||
// TODO cleaning based on different subject vocabs can be added here
|
// TODO cleaning based on different subject vocabs can be added here
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The procedure cleans out the subject values, using a vocabulary identified by the field subject.qualifier.classid.
|
||||||
|
*
|
||||||
|
* @param vocabularyId
|
||||||
|
* @param vocabularies
|
||||||
|
* @param subject
|
||||||
|
*/
|
||||||
private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
|
private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
|
||||||
Subject subject) {
|
Subject subject) {
|
||||||
|
|
||||||
|
@ -49,14 +60,22 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
|
||||||
subject.getQualifier().setClassid(vocabularyId);
|
subject.getQualifier().setClassid(vocabularyId);
|
||||||
subject.getQualifier().setClassname(vocabulary.getName());
|
subject.getQualifier().setClassname(vocabulary.getName());
|
||||||
}
|
}
|
||||||
} else if (vocabularyId.equals(subject.getQualifier().getClassid()) &&
|
} else {
|
||||||
Objects.nonNull(subject.getDataInfo()) &&
|
final String provenanceActionClassId = Optional
|
||||||
!"subject:fos".equals(subject.getDataInfo().getProvenanceaction())) {
|
.ofNullable(subject.getDataInfo())
|
||||||
Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue());
|
.map(DataInfo::getProvenanceaction)
|
||||||
VocabularyTerm term = vocabulary.getTerm(subject.getValue());
|
.map(Qualifier::getClassid)
|
||||||
if (Objects.isNull(syn) && Objects.isNull(term)) {
|
.orElse(null);
|
||||||
subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD);
|
|
||||||
subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD);
|
if (vocabularyId.equals(subject.getQualifier().getClassid()) &&
|
||||||
|
!"subject:fos".equals(provenanceActionClassId)) {
|
||||||
|
|
||||||
|
Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue());
|
||||||
|
VocabularyTerm term = vocabulary.getTerm(subject.getValue());
|
||||||
|
if (Objects.isNull(syn) && Objects.isNull(term)) {
|
||||||
|
subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD);
|
||||||
|
subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
|
@ -130,12 +130,13 @@ public class GenerateEntitiesApplication extends AbstractMigrationApplication {
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case claim:
|
case claim:
|
||||||
save(
|
save(
|
||||||
inputRdd.keyBy(oaf -> ModelSupport.idFn().apply(oaf))
|
inputRdd
|
||||||
.groupByKey()
|
.keyBy(oaf -> ModelSupport.idFn().apply(oaf))
|
||||||
.map(t -> MergeUtils.mergeGroup(t._1, t._2.iterator())),
|
.groupByKey()
|
||||||
//.mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
|
.map(t -> MergeUtils.mergeGroup(t._1, t._2.iterator())),
|
||||||
//.reduceByKey(MergeUtils::merge)
|
// .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
|
||||||
//.map(Tuple2::_2),
|
// .reduceByKey(MergeUtils::merge)
|
||||||
|
// .map(Tuple2::_2),
|
||||||
targetPath);
|
targetPath);
|
||||||
break;
|
break;
|
||||||
case graph:
|
case graph:
|
||||||
|
|
|
@ -33,7 +33,7 @@ SELECT
|
||||||
dc.officialname AS collectedfromname,
|
dc.officialname AS collectedfromname,
|
||||||
p.contracttype || '@@@' || p.contracttypescheme AS contracttype,
|
p.contracttype || '@@@' || p.contracttypescheme AS contracttype,
|
||||||
p.provenanceactionclass || '@@@' || p.provenanceactionscheme AS provenanceaction,
|
p.provenanceactionclass || '@@@' || p.provenanceactionscheme AS provenanceaction,
|
||||||
array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid,,
|
array_remove(array_agg(DISTINCT i.pid || '###' || i.issuertype || '@@@' || i.issuertype), NULL) AS pid,
|
||||||
array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
|
array_agg(DISTINCT s.name || '###' || s.semanticclass || '@@@' || s.semanticscheme) AS subjects,
|
||||||
array_agg(DISTINCT fp.path) AS fundingtree
|
array_agg(DISTINCT fp.path) AS fundingtree
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
[
|
||||||
|
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false},
|
||||||
|
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
|
||||||
|
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
|
||||||
|
]
|
|
@ -0,0 +1,166 @@
|
||||||
|
{
|
||||||
|
"cites":{
|
||||||
|
"original":"Cites",
|
||||||
|
"inverse":"IsCitedBy"
|
||||||
|
},
|
||||||
|
"compiles":{
|
||||||
|
"original":"Compiles",
|
||||||
|
"inverse":"IsCompiledBy"
|
||||||
|
},
|
||||||
|
"continues":{
|
||||||
|
"original":"Continues",
|
||||||
|
"inverse":"IsContinuedBy"
|
||||||
|
},
|
||||||
|
"derives":{
|
||||||
|
"original":"IsSourceOf",
|
||||||
|
"inverse":"IsDerivedFrom"
|
||||||
|
},
|
||||||
|
"describes":{
|
||||||
|
"original":"Describes",
|
||||||
|
"inverse":"IsDescribedBy"
|
||||||
|
},
|
||||||
|
"documents":{
|
||||||
|
"original":"Documents",
|
||||||
|
"inverse":"IsDocumentedBy"
|
||||||
|
},
|
||||||
|
"hasmetadata":{
|
||||||
|
"original":"HasMetadata",
|
||||||
|
"inverse":"IsMetadataOf"
|
||||||
|
},
|
||||||
|
"hasassociationwith":{
|
||||||
|
"original":"HasAssociationWith",
|
||||||
|
"inverse":"HasAssociationWith"
|
||||||
|
},
|
||||||
|
"haspart":{
|
||||||
|
"original":"HasPart",
|
||||||
|
"inverse":"IsPartOf"
|
||||||
|
},
|
||||||
|
"hasversion":{
|
||||||
|
"original":"HasVersion",
|
||||||
|
"inverse":"IsVersionOf"
|
||||||
|
},
|
||||||
|
"iscitedby":{
|
||||||
|
"original":"IsCitedBy",
|
||||||
|
"inverse":"Cites"
|
||||||
|
},
|
||||||
|
"iscompiledby":{
|
||||||
|
"original":"IsCompiledBy",
|
||||||
|
"inverse":"Compiles"
|
||||||
|
},
|
||||||
|
"iscontinuedby":{
|
||||||
|
"original":"IsContinuedBy",
|
||||||
|
"inverse":"Continues"
|
||||||
|
},
|
||||||
|
"isderivedfrom":{
|
||||||
|
"original":"IsDerivedFrom",
|
||||||
|
"inverse":"IsSourceOf"
|
||||||
|
},
|
||||||
|
"isdescribedby":{
|
||||||
|
"original":"IsDescribedBy",
|
||||||
|
"inverse":"Describes"
|
||||||
|
},
|
||||||
|
"isdocumentedby":{
|
||||||
|
"original":"IsDocumentedBy",
|
||||||
|
"inverse":"Documents"
|
||||||
|
},
|
||||||
|
"isidenticalto":{
|
||||||
|
"original":"IsIdenticalTo",
|
||||||
|
"inverse":"IsIdenticalTo"
|
||||||
|
},
|
||||||
|
"ismetadatafor":{
|
||||||
|
"original":"IsMetadataFor",
|
||||||
|
"inverse":"IsMetadataOf"
|
||||||
|
},
|
||||||
|
"ismetadataof":{
|
||||||
|
"original":"IsMetadataOf",
|
||||||
|
"inverse":"IsMetadataFor"
|
||||||
|
},
|
||||||
|
"isnewversionof":{
|
||||||
|
"original":"IsNewVersionOf",
|
||||||
|
"inverse":"IsPreviousVersionOf"
|
||||||
|
},
|
||||||
|
"isobsoletedby":{
|
||||||
|
"original":"IsObsoletedBy",
|
||||||
|
"inverse":"Obsoletes"
|
||||||
|
},
|
||||||
|
"isoriginalformof":{
|
||||||
|
"original":"IsOriginalFormOf",
|
||||||
|
"inverse":"IsVariantFormOf"
|
||||||
|
},
|
||||||
|
"ispartof":{
|
||||||
|
"original":"IsPartOf",
|
||||||
|
"inverse":"HasPart"
|
||||||
|
},
|
||||||
|
"ispreviousversionof":{
|
||||||
|
"original":"IsPreviousVersionOf",
|
||||||
|
"inverse":"IsNewVersionOf"
|
||||||
|
},
|
||||||
|
"isreferencedby":{
|
||||||
|
"original":"IsReferencedBy",
|
||||||
|
"inverse":"References"
|
||||||
|
},
|
||||||
|
"isrelatedto":{
|
||||||
|
"original":"IsRelatedTo",
|
||||||
|
"inverse":"IsRelatedTo"
|
||||||
|
},
|
||||||
|
"isrequiredby":{
|
||||||
|
"original":"IsRequiredBy",
|
||||||
|
"inverse":"Requires"
|
||||||
|
},
|
||||||
|
"isreviewedby":{
|
||||||
|
"original":"IsReviewedBy",
|
||||||
|
"inverse":"Reviews"
|
||||||
|
},
|
||||||
|
"issourceof":{
|
||||||
|
"original":"IsSourceOf",
|
||||||
|
"inverse":"IsDerivedFrom"
|
||||||
|
},
|
||||||
|
"issupplementedby":{
|
||||||
|
"original":"IsSupplementedBy",
|
||||||
|
"inverse":"IsSupplementTo"
|
||||||
|
},
|
||||||
|
"issupplementto":{
|
||||||
|
"original":"IsSupplementTo",
|
||||||
|
"inverse":"IsSupplementedBy"
|
||||||
|
},
|
||||||
|
"isvariantformof":{
|
||||||
|
"original":"IsVariantFormOf",
|
||||||
|
"inverse":"IsOriginalFormOf"
|
||||||
|
},
|
||||||
|
"isversionof":{
|
||||||
|
"original":"IsVersionOf",
|
||||||
|
"inverse":"HasVersion"
|
||||||
|
},
|
||||||
|
"obsoletes":{
|
||||||
|
"original":"Obsoletes",
|
||||||
|
"inverse":"IsObsoletedBy"
|
||||||
|
},
|
||||||
|
"references":{
|
||||||
|
"original":"References",
|
||||||
|
"inverse":"IsReferencedBy"
|
||||||
|
},
|
||||||
|
"requires":{
|
||||||
|
"original":"Requires",
|
||||||
|
"inverse":"IsRequiredBy"
|
||||||
|
},
|
||||||
|
"related":{
|
||||||
|
"original":"IsRelatedTo",
|
||||||
|
"inverse":"IsRelatedTo"
|
||||||
|
},
|
||||||
|
"reviews":{
|
||||||
|
"original":"Reviews",
|
||||||
|
"inverse":"IsReviewedBy"
|
||||||
|
},
|
||||||
|
"unknown":{
|
||||||
|
"original":"Unknown",
|
||||||
|
"inverse":"Unknown"
|
||||||
|
},
|
||||||
|
"isamongtopnsimilardocuments": {
|
||||||
|
"original": "IsAmongTopNSimilarDocuments",
|
||||||
|
"inverse": "HasAmongTopNSimilarDocuments"
|
||||||
|
},
|
||||||
|
"hasamongtopnsimilardocuments": {
|
||||||
|
"original": "HasAmongTopNSimilarDocuments",
|
||||||
|
"inverse": "IsAmongTopNSimilarDocuments"
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,363 @@
|
||||||
|
package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
|
||||||
|
import eu.dnetlib.dhp.schema.sx.scholix.flat.ScholixFlat
|
||||||
|
import eu.dnetlib.dhp.schema.sx.scholix.{
|
||||||
|
Scholix,
|
||||||
|
ScholixCollectedFrom,
|
||||||
|
ScholixEntityId,
|
||||||
|
ScholixIdentifier,
|
||||||
|
ScholixRelationship,
|
||||||
|
ScholixResource
|
||||||
|
}
|
||||||
|
import org.apache.logging.log4j.core.appender.ConsoleAppender.Target
|
||||||
|
import org.json4s
|
||||||
|
import org.json4s.DefaultFormats
|
||||||
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
|
/** Flat view of a relation between two entities, as consumed by the Scholix generators.
  *
  * @param source        identifier of the source entity
  * @param target        identifier of the target entity
  * @param relclass      name of the relation class; it is looked up (lower-cased) in the relation vocabulary
  * @param id            identifier of the relation
  * @param collectedfrom key/value pairs of the datasources that provided the relation; may be null or empty
  */
case class RelationInfo(source: String, target: String, relclass: String, id: String, collectedfrom: Seq[RelKeyValue])
|
||||||
|
/** Minimal key/value pair; for relations, `key` is the datasource identifier and `value` its name. */
case class RelKeyValue(key: String, value: String)
|
||||||
|
|
||||||
|
/** Compact summary of a Scholix resource, carrying only the fields needed by the flat Scholix record.
  *
  * @param id         identifier of the resource
  * @param typology   object type of the resource
  * @param subType    object sub type of the resource
  * @param pids       distinct persistent identifier values
  * @param pidTypes   distinct persistent identifier schemas
  * @param publishers publisher names
  * @param date       publication date
  */
case class SummaryResource(
  id: String,
  typology: String,
  subType: String,
  pids: List[String],
  pidTypes: List[String],
  publishers: List[String],
  date: String
)
|
||||||
|
|
||||||
|
object ScholexplorerUtils {
|
||||||
|
|
||||||
|
/** Schema label set on the identifiers of datasources (publishers, link providers, collected-from). */
val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"

/** Jackson mapper used by this object to serialize Scholix records to JSON. */
val mapper = new ObjectMapper()

/** Entry of the relation vocabulary: the `original` relation name and the name of its `inverse`. */
case class RelationVocabulary(original: String, inverse: String)
|
||||||
|
|
||||||
|
/** Relation vocabulary, loaded once from the classpath resource
  * `/eu/dnetlib/dhp/sx/relation/relations.json`.
  *
  * The helpers of this object look entries up by lower-cased relation name.
  */
val relations: Map[String, RelationVocabulary] = {
  implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats

  val resource = Source.fromInputStream(
    getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json")
  )
  // Read the resource eagerly and always release the underlying stream,
  // which the previous implementation left open.
  val input =
    try resource.mkString
    finally resource.close()

  parse(input).extract[Map[String, RelationVocabulary]]
}
|
||||||
|
|
||||||
|
/** Returns the name of the inverse of the given relation.
  *
  * @param rel relation name; matched case-insensitively against the relation vocabulary
  * @return the inverse relation name, or null when `rel` is null or not present in the vocabulary
  */
def invRel(rel: String): String =
  // Option(rel) guards against null input, which would otherwise throw on toLowerCase
  // (this method is invoked from a Spark UDF, where null column values are passed as null).
  Option(rel)
    .flatMap(name => relations.get(name.toLowerCase))
    .map(_.inverse)
    .orNull
|
||||||
|
|
||||||
|
/** Builds the OpenAIRE Explore URL of a datasource.
  *
  * @param id datasource identifier; its first three characters are stripped before being used in the URL
  * @return the URL, or null when `id` is null or not longer than 12 characters
  */
def generateDatasourceOpenAIREURLS(id: String): String =
  Option(id)
    .filter(_.length > 12)
    .map(dsId => s"https://explore.openaire.eu/search/dataprovider?datasourceId=${dsId.substring(3)}")
    .orNull
|
||||||
|
|
||||||
|
/** Pairs every persistent identifier with the first URL that contains its value (case-insensitive).
  *
  * @param pidValue persistent identifiers to resolve
  * @param urls     candidate URLs
  * @return one tuple per identifier, in input order; the URL is null when no candidate contains the value
  */
def findURLForPID(
  pidValue: List[StructuredProperty],
  urls: List[String]
): List[(StructuredProperty, String)] =
  for (pid <- pidValue) yield {
    val matchingUrl = urls.collectFirst {
      case url if url.toLowerCase.contains(pid.getValue.toLowerCase) => url
    }
    (pid, matchingUrl.orNull)
  }
|
||||||
|
|
||||||
|
/** Collects the distinct typed identifiers declared by the instances of a result.
  *
  * Only instances having both a non-empty URL list and a pid list are considered; each pid is
  * associated with the URL found by `findURLForPID`.
  *
  * @param r the result to inspect
  * @return the distinct identifiers, or an empty list when the result has no instances
  */
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
  val instances = r.getInstance()
  if (instances == null || instances.isEmpty)
    List()
  else
    instances.asScala
      .filter(i => i.getUrl != null && !i.getUrl.isEmpty && i.getPid != null)
      .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
      .map { case (pid, url) => new ScholixIdentifier(pid.getValue, pid.getQualifier.getClassid, url) }
      .distinct
      .toList
}
|
||||||
|
|
||||||
|
/** Reduces a Scholix resource to its summary.
  *
  * @param input the resource to summarize
  * @return a summary with the distinct pid values, the distinct pid schemas and at most five
  *         publisher names (distinct, sorted); missing collections become empty lists
  */
def generateSummaryResource(input: ScholixResource): SummaryResource = {
  val identifiers = Option(input.getIdentifier).map(_.asScala.toList).getOrElse(List())

  val publisherNames = Option(input.getPublisher)
    .map(_.asScala.map(_.getName).distinct.sorted.take(5).toList)
    .getOrElse(List())

  SummaryResource(
    id = input.getDnetIdentifier,
    typology = input.getObjectType,
    subType = input.getObjectSubType,
    pids = identifiers.map(_.getIdentifier).distinct,
    pidTypes = identifiers.map(_.getSchema).distinct,
    publishers = publisherNames,
    date = input.getPublicationDate
  )
}
|
||||||
|
|
||||||
|
/** Creates one half of a flat Scholix record from a relation and the summary of one of its ends.
  *
  * @param relation     the relation being exported
  * @param summary      summary of the source (when `updateSource` is true) or of the target
  * @param updateSource true to fill the source fields, false to fill the target fields
  * @return the partially filled record, or null when the relation class is not in the vocabulary
  *         or the summary has no pids
  */
def generateScholixFlat(relation: RelationInfo, summary: SummaryResource, updateSource: Boolean): ScholixFlat = {
  // At most five provider names, falling back to "OpenAIRE" when the relation declares none.
  val providers: List[String] =
    if (relation.collectedfrom == null || relation.collectedfrom.isEmpty)
      List("OpenAIRE")
    else
      relation.collectedfrom.map(_.value).distinct.sorted.take(5).toList

  val vocabulary = relations.get(relation.relclass.toLowerCase).orNull
  if (vocabulary == null || summary.pids.isEmpty)
    return null

  val flat = new ScholixFlat
  flat.setIdentifier(relation.id)
  flat.setLinkProviders(providers.asJava)
  flat.setRelationType(vocabulary.original)
  flat.setPublicationDate(summary.date)

  val hasPublishers = summary.publishers.nonEmpty
  if (updateSource) {
    flat.setSourceId(summary.id)
    flat.setSourcePid(summary.pids.asJava)
    flat.setSourcePidType(summary.pidTypes.asJava)
    flat.setSourceType(summary.typology)
    flat.setSourceSubType(summary.subType)
    if (hasPublishers)
      flat.setSourcePublisher(summary.publishers.asJava)
  } else {
    flat.setTargetId(summary.id)
    flat.setTargetPid(summary.pids.asJava)
    flat.setTargetPidType(summary.pidTypes.asJava)
    flat.setTargetType(summary.typology)
    flat.setTargetSubType(summary.subType)
    if (hasPublishers)
      flat.setTargetPublisher(summary.publishers.asJava)
  }
  flat
}
|
||||||
|
|
||||||
|
/** Merges the target half of a flat Scholix record into the source half.
  *
  * The target fields of `target` are copied onto `source`; the publication date of `target` is used
  * only when `source` has none; the link providers of both halves are merged (distinct, sorted,
  * at most five).
  *
  * @param source record carrying the source fields; it is updated in place
  * @param target record carrying the target fields
  * @return `source`, updated
  */
def mergeScholixFlat(source: ScholixFlat, target: ScholixFlat): ScholixFlat = {
  if (source.getPublicationDate == null) {
    source.setPublicationDate(target.getPublicationDate)
  }

  source.setTargetId(target.getTargetId)
  source.setTargetPid(target.getTargetPid)
  source.setTargetPidType(target.getTargetPidType)
  source.setTargetType(target.getTargetType)
  source.setTargetSubType(target.getTargetSubType)
  // The target publisher is a target field like the others: copy it unconditionally instead of
  // only when the source happens to have link providers.
  source.setTargetPublisher(target.getTargetPublisher)

  // The previous `else if` repeated the condition of its `if`, so this merge was unreachable.
  val mergedProviders = Seq(source.getLinkProviders, target.getLinkProviders)
    .filter(_ != null)
    .flatMap(_.asScala)
    .distinct
    .sorted
    .take(5)
  if (mergedProviders.nonEmpty)
    source.setLinkProviders(mergedProviders.asJava)

  source
}
|
||||||
|
|
||||||
|
/** Converts a result into a Scholix resource.
  *
  * @param result the result to convert
  * @return the resource, or null when the result has no instances, no pids, or no typed
  *         identifier can be extracted from its instances
  */
def generateScholixResourceFromResult(result: Result): ScholixResource = {

  // Builds the entity describing a datasource, identified by its OpenAIRE identifier and URL.
  def datasourceEntity(name: String, key: String): ScholixEntityId = {
    val identifier = new ScholixIdentifier()
    identifier.setIdentifier(key)
    identifier.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
    identifier.setUrl(generateDatasourceOpenAIREURLS(key))
    val entity = new ScholixEntityId()
    entity.setName(name)
    entity.setIdentifiers(List(identifier).asJava)
    entity
  }

  if (result.getInstance() == null || result.getInstance().isEmpty)
    return null

  if (result.getPid == null || result.getPid.isEmpty)
    return null

  val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result)
  if (persistentIdentifiers.isEmpty)
    return null

  val instances = result.getInstance().asScala.filter(_ != null)

  val r = new ScholixResource
  r.setDnetIdentifier(result.getId)
  r.setIdentifier(persistentIdentifiers.asJava)

  // A missing result type yields a null object type instead of a NullPointerException.
  r.setObjectType(Option(result.getResulttype).map(_.getClassid).orNull)

  // headOption: when no instance declares an instance type the sub type stays null
  // (the former `.head` threw NoSuchElementException in that case).
  r.setObjectSubType(
    instances
      .filter(_.getInstancetype != null)
      .map(_.getInstancetype.getClassname)
      .headOption
      .orNull
  )

  // The first title, when any, becomes the title of the resource.
  if (result.getTitle != null)
    result.getTitle.asScala.headOption.foreach(t => r.setTitle(t.getValue))

  if (result.getAuthor != null && !result.getAuthor.isEmpty) {
    val authors: List[ScholixEntityId] = result.getAuthor.asScala.map { a =>
      val entity = new ScholixEntityId()
      entity.setName(a.getFullname)
      if (a.getPid != null && a.getPid.size() > 0)
        entity.setIdentifiers(
          a.getPid.asScala
            .map { sp =>
              val id = new ScholixIdentifier()
              id.setIdentifier(sp.getValue)
              id.setSchema(sp.getQualifier.getClassid)
              id
            }
            .take(3) // keep at most three identifiers per author
            .toList
            .asJava
        )
      entity
    }.toList
    r.setCreator(authors.asJava)
  }

  // The first date of acceptance found among the instances is the publication date.
  instances
    .filter(_.getDateofacceptance != null)
    .map(_.getDateofacceptance.getValue)
    .headOption
    .foreach(d => r.setPublicationDate(d))

  // Publishers are the hosting datasources, skipping missing and "unknown" ones.
  r.setPublisher(
    instances
      .map(_.getHostedby)
      .filter(h => h != null && !"unknown".equalsIgnoreCase(h.getValue))
      .map(h => datasourceEntity(h.getValue, h.getKey))
      .distinct
      .asJava
  )

  // A null collectedfrom list yields an empty list instead of a NullPointerException.
  r.setCollectedFrom(
    Option(result.getCollectedfrom)
      .map(_.asScala.toList)
      .getOrElse(List())
      .filter(_ != null)
      .map { cf =>
        val scf = new ScholixCollectedFrom()
        scf.setProvisionMode("collected")
        scf.setCompletionStatus("complete")
        scf.setProvider(datasourceEntity(cf.getValue, cf.getKey))
        scf
      }
      .asJava
  )

  r
}
|
||||||
|
|
||||||
|
/** Creates a Scholix record for a relation whose source resource is known.
  *
  * The target is a placeholder resource carrying only the target identifier of the relation.
  *
  * @param relation the relation being exported
  * @param source   the resolved source resource
  * @return the record, or null when the relation class is not in the vocabulary
  */
def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {

  // Builds a link provider entry for a datasource, with its OpenAIRE identifier and URL.
  def linkProvider(name: String, key: String): ScholixEntityId = {
    val identifier = new ScholixIdentifier()
    identifier.setIdentifier(key)
    identifier.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
    identifier.setUrl(generateDatasourceOpenAIREURLS(key))
    val entity = new ScholixEntityId()
    entity.setName(name)
    entity.setIdentifiers(List(identifier).asJava)
    entity
  }

  // Relations without provenance are attributed to OpenAIRE itself.
  val linkProviders: List[ScholixEntityId] =
    if (relation.collectedfrom == null || relation.collectedfrom.isEmpty)
      List(linkProvider("OpenAIRE", "10|infrastruct_::f66f1bd369679b5b077dcdf006089556"))
    else
      relation.collectedfrom.map(cf => linkProvider(cf.value, cf.key)).toList

  val vocabulary = relations.get(relation.relclass.toLowerCase).orNull
  if (vocabulary == null)
    return null

  val placeholderTarget = new ScholixResource
  placeholderTarget.setDnetIdentifier(relation.target)

  val scholix: Scholix = new Scholix
  scholix.setSource(source)
  scholix.setLinkprovider(linkProviders.asJava)
  scholix.setIdentifier(relation.id)
  scholix.setRelationship(new ScholixRelationship(vocabulary.original, "datacite", vocabulary.inverse))
  scholix.setPublicationDate(source.getPublicationDate)
  scholix.setPublisher(source.getPublisher)
  scholix.setTarget(placeholderTarget)
  scholix
}
|
||||||
|
|
||||||
|
/** Sets the target of a Scholix record, merges the publishers of record and target, and serializes it.
  *
  * @param s the record to complete; it is updated in place
  * @param t the resolved target resource
  * @return the JSON serialization of the updated record
  */
def updateTarget(s: Scholix, t: ScholixResource): String = {
  // Null or empty publisher lists are treated as no publishers.
  def publishersOf(list: java.util.List[ScholixEntityId]): Seq[ScholixEntityId] =
    if (list == null || list.isEmpty) List() else list.asScala

  s.setTarget(t)
  // At most ten distinct publishers, those of the record first.
  val merged = publishersOf(s.getPublisher).union(publishersOf(t.getPublisher)).distinct.take(10).toList
  s.setPublisher(merged.asJava)
  mapper.writeValueAsString(s)
}
|
||||||
|
}
|
|
@ -0,0 +1,183 @@
|
||||||
|
package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{
|
||||||
|
KeyValue,
|
||||||
|
OtherResearchProduct,
|
||||||
|
Publication,
|
||||||
|
Relation,
|
||||||
|
Result,
|
||||||
|
Software,
|
||||||
|
Dataset => OafDataset
|
||||||
|
}
|
||||||
|
import eu.dnetlib.dhp.schema.sx.scholix.flat.ScholixFlat
|
||||||
|
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
|
||||||
|
import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
|
||||||
|
import org.apache.spark.sql.types.StructType
|
||||||
|
import org.apache.spark.sql._
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
|
||||||
|
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||||
|
|
||||||
|
/** All Spark applications run this method,
|
||||||
|
* where the whole logic of the spark node is defined
|
||||||
|
*/
|
||||||
|
override def run(): Unit = {
  // Reads an argument from the parser and logs it as "<name>: <value>".
  def loggedArgument(name: String): String = {
    val value = parser.get(name)
    log.info(s"$name: {}", value)
    value
  }

  val sourcePath = loggedArgument("sourcePath")
  val targetPath = loggedArgument("targetPath")

  // The three steps run in sequence: relations, then resources, then the flat Scholix.
  generateBidirectionalRelations(sourcePath, targetPath, spark)
  generateScholixResource(sourcePath, targetPath, spark)
  generateFlatScholix(targetPath, spark)
}
|
||||||
|
|
||||||
|
/** Converts the results of every supported entity type into Scholix resources.
  *
  * Reads `publication`, `dataset`, `software` and `otherresearchproduct` as JSON from below
  * `inputPath`, drops the results that cannot be converted, and overwrites `outputPath/resource`
  * with the union.
  *
  * @param inputPath  base path of the input entities
  * @param outputPath base path of the dump
  * @param spark      the active Spark session
  */
def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
  val entityMap: Map[String, StructType] = Map(
    "publication"          -> Encoders.bean(classOf[Publication]).schema,
    "dataset"              -> Encoders.bean(classOf[OafDataset]).schema,
    "software"             -> Encoders.bean(classOf[Software]).schema,
    "otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
  )

  implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
  implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])

  val resDs = spark.emptyDataset[ScholixResource]
  val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs) { case (res, (entityName, schema)) =>
    // Use the application logger (as run() does) rather than printing to stdout.
    log.info("adding {}", entityName)
    res.union(
      spark.read
        .schema(schema)
        .json(s"$inputPath/$entityName")
        .as[Result]
        .map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
        .filter(s => s != null) // the conversion returns null for results that cannot be exported
    )
  }
  scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
}
|
||||||
|
|
||||||
|
/** Produces the relation set in both directions and stores it under `otuputPath/relation`.
 *
 * Relations are read as JSON from `inputPath/relation` and kept only when they are
 * not deleted by inference, both `source` and `target` match `50%`, and `relClass`
 * is neither `merges` nor `isMergedIn`. For every kept relation an inverse one is
 * generated (source/target swapped, `relClass` mapped by `ScholexplorerUtils.invRel`).
 * Direct and inverse relations are unioned, keyed by the md5 of
 * `source + relClass + target`, and grouped by that key keeping the first value
 * of each column.
 *
 * @param inputPath  base path containing the `relation` folder
 * @param otuputPath base path under which the `relation` folder is written
 *                   (the misspelt parameter name is kept for callers using named arguments)
 * @param spark      the Spark session used for reading and writing
 */
def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = {
  import org.apache.spark.sql.functions.udf

  val directRelations = spark.read
    .schema(Encoders.bean(classOf[Relation]).schema)
    .json(s"$inputPath/relation")
    .where(
      "datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
      "and relClass <> 'merges' and relClass <> 'isMergedIn'"
    )
    .select("source", "target", "collectedfrom", "relClass")

  // Column-level wrapper around ScholexplorerUtils.invRel.
  val invertRelClass: String => String = relClass => ScholexplorerUtils.invRel(relClass)
  val invertRelClassUDF = udf(invertRelClass)

  // Same columns, in the same order as directRelations, so that the positional union lines up.
  val invertedRelations = directRelations.select(
    col("target").alias("source"),
    col("source").alias("target"),
    col("collectedfrom"),
    invertRelClassUDF(col("relClass")).alias("relClass")
  )

  val relationKey = md5(concat(col("source"), col("relClass"), col("target")))

  val bidirectional = invertedRelations
    .union(directRelations)
    .withColumn("id", relationKey)
    // Rebuild collectedfrom keeping only the key/value fields of each element.
    .withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
    .drop("collectedfrom")
    .withColumnRenamed("cf", "collectedfrom")
    .groupBy(col("id"))
    .agg(
      first("source").alias("source"),
      first("target").alias("target"),
      first("relClass").alias("relClass"),
      first("collectedfrom").alias("collectedfrom")
    )

  bidirectional.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation")
}
|
||||||
|
|
||||||
|
/** Generates the flat Scholix dump from the data already stored under `outputPath`.
 *
 * Every intermediate result is written to disk and read back before the next step:
 *  1. `resource` is mapped to summaries and saved in `summary`;
 *  2. relations joined with summaries on `source` are saved in `scholix_source`;
 *  3. relations joined with summaries on `target` are saved in `scholix_target`;
 *  4. the two halves are inner-joined on `identifier`, merged with
 *     `ScholexplorerUtils.mergeScholixFlat` and written as gzip-compressed JSON
 *     in `scholix`.
 *
 * @param outputPath base path holding the `relation` and `resource` folders and
 *                   receiving all the folders listed above
 * @param spark      the Spark session used for reading and writing
 */
def generateFlatScholix(outputPath: String, spark: SparkSession): Unit = {
  import spark.implicits._
  implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
  implicit val scholixEncoder: Encoder[ScholixFlat] = Encoders.bean(classOf[ScholixFlat])

  val summaryPath = s"$outputPath/summary"
  val sourceHalfPath = s"$outputPath/scholix_source"
  val targetHalfPath = s"$outputPath/scholix_target"

  val relationDS = spark.read.load(s"$outputPath/relation").as[RelationInfo]
  val resourceDS = spark.read.load(s"$outputPath/resource").as[ScholixResource]

  // Step 1: materialise the summaries, then reload them from disk.
  resourceDS
    .map(res => ScholexplorerUtils.generateSummaryResource(res))
    .write
    .mode(SaveMode.Overwrite)
    .save(summaryPath)
  val summaryDS = spark.read.load(summaryPath).as[SummaryResource]

  // Step 2: half record built from the summary of the relation source.
  relationDS
    .joinWith(summaryDS, relationDS("source") === summaryDS("id"))
    .map(pair => ScholexplorerUtils.generateScholixFlat(pair._1, pair._2, true))
    .write
    .mode(SaveMode.Overwrite)
    .save(sourceHalfPath)
  val sourceHalf = spark.read.load(sourceHalfPath).as[ScholixFlat]

  // Step 3: half record built from the summary of the relation target.
  relationDS
    .joinWith(summaryDS, relationDS("target") === summaryDS("id"))
    .map(pair => ScholexplorerUtils.generateScholixFlat(pair._1, pair._2, false))
    .write
    .mode(SaveMode.Overwrite)
    .save(targetHalfPath)
  val targetHalf = spark.read.load(targetHalfPath).as[ScholixFlat]

  // Step 4: merge the two halves sharing the same identifier.
  sourceHalf
    .joinWith(targetHalf, sourceHalf("identifier") === targetHalf("identifier"), "inner")
    .map(halves => ScholexplorerUtils.mergeScholixFlat(halves._1, halves._2))
    .write
    .mode(SaveMode.Overwrite)
    .option("compression", "gzip")
    .json(s"$outputPath/scholix")
}
|
||||||
|
|
||||||
|
/** Generates full Scholix records from the data already stored under `outputPath`.
 *
 * Relations are inner-joined with the resources on `source` to create a Scholix
 * record keyed by its identifier, and on `target` to obtain the target resource
 * keyed by the relation id. The two are then inner-joined on their key, combined
 * by `ScholexplorerUtils.updateTarget` and written as gzip-compressed text in
 * `outputPath/scholix`.
 *
 * NOTE(review): the result is written with `.text`, which needs a single string
 * column; presumably `updateTarget` returns the serialised record as a String —
 * confirm against ScholexplorerUtils.
 *
 * @param outputPath base path holding the `relation` and `resource` folders and
 *                   receiving the `scholix` folder
 * @param spark      the Spark session used for reading and writing
 */
def generateScholix(outputPath: String, spark: SparkSession): Unit = {
  implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
  implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])

  import spark.implicits._

  val relationDS = spark.read.load(s"$outputPath/relation").as[RelationInfo]
  val resourceDS = spark.read.load(s"$outputPath/resource").as[ScholixResource]

  // (scholix identifier, scholix built from the source side of the relation)
  val scholixBySource = relationDS
    .joinWith(resourceDS, relationDS("source") === resourceDS("dnetIdentifier"), "inner")
    .map(pair => ScholexplorerUtils.generateScholix(pair._1, pair._2))
    .map(scholix => (scholix.getIdentifier, scholix))(
      Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix]))
    )

  // (relation id, resource found on the target side of the relation)
  val targetByRelation = relationDS
    .joinWith(resourceDS, relationDS("target") === resourceDS("dnetIdentifier"), "inner")
    .map(pair => (pair._1.id, pair._2))(
      Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource]))
    )

  scholixBySource
    .joinWith(targetByRelation, scholixBySource("_1") === targetByRelation("_1"), "inner")
    .map(joined => ScholexplorerUtils.updateTarget(joined._1._2, joined._2._2))
    .write
    .mode(SaveMode.Overwrite)
    .option("compression", "gzip")
    .text(s"$outputPath/scholix")
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Companion object hosting the logger and the command-line entry point of the job. */
object SparkCreateScholexplorerDump {

  val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)

  /** Creates the application with the bundled parameter description, initializes it and runs it.
   *
   * @param args command-line arguments forwarded to the application
   */
  def main(args: Array[String]): Unit = {
    val application = new SparkCreateScholexplorerDump(
      "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json",
      args,
      logger
    )
    application.initialize().run()
  }

}
|
|
@ -71,7 +71,7 @@ class GenerateEntitiesApplicationTest {
|
||||||
|
|
||||||
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
|
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
|
||||||
String resultType) {
|
String resultType) {
|
||||||
final Result merge = MergeUtils.mergeResult(publication, dataset);
|
final Result merge = (Result) MergeUtils.merge(publication, dataset);
|
||||||
assertTrue(clazz.isAssignableFrom(merge.getClass()));
|
assertTrue(clazz.isAssignableFrom(merge.getClass()));
|
||||||
assertEquals(resultType, merge.getResulttype().getClassid());
|
assertEquals(resultType, merge.getResulttype().getClassid());
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,28 @@
|
||||||
|
package eu.dnetlib.dhp.sx.graph.scholix
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
|
||||||
|
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
|
import org.junit.jupiter.api.Test
|
||||||
|
import org.objenesis.strategy.StdInstantiatorStrategy
|
||||||
|
|
||||||
|
/** Local smoke test running the three dump-generation steps on a sample graph.
 *
 * The sample is expected in `<basePath>/scholix_sample/` and the output is written to
 * `<basePath>/scholix/`. `basePath` defaults to the original developer-local folder and
 * can be overridden with the `scholix.test.basePath` system property.
 */
class ScholixGenerationTest {

  @Test
  def generateScholix(): Unit = {

    val basePath = sys.props.getOrElse("scholix.test.basePath", "/Users/sandro/Downloads")
    val samplePath = s"$basePath/scholix_sample/"
    val outputPath = s"$basePath/scholix/"

    // The sample data is not part of the repository: skip (rather than fail) where it is missing.
    org.junit.jupiter.api.Assumptions.assumeTrue(
      java.nio.file.Files.isDirectory(java.nio.file.Paths.get(samplePath)),
      s"sample data not found in $samplePath, skipping"
    )

    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
    // Only the generate* methods are exercised; they do not use the parser or the logger.
    val app = new SparkCreateScholexplorerDump(null, null, null)

    app.generateScholixResource(samplePath, outputPath, spark)
    app.generateBidirectionalRelations(samplePath, outputPath, spark)
    app.generateFlatScholix(outputPath, spark)

  }

}
|
|
@ -18,7 +18,7 @@
|
||||||
<executions>
|
<executions>
|
||||||
<execution>
|
<execution>
|
||||||
<id>scala-compile-first</id>
|
<id>scala-compile-first</id>
|
||||||
<phase>initialize</phase>
|
<phase>process-resources</phase>
|
||||||
<goals>
|
<goals>
|
||||||
<goal>add-source</goal>
|
<goal>add-source</goal>
|
||||||
<goal>compile</goal>
|
<goal>compile</goal>
|
||||||
|
@ -59,12 +59,6 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
<artifactId>json-path</artifactId>
|
<artifactId>json-path</artifactId>
|
||||||
<exclusions>
|
|
||||||
<exclusion>
|
|
||||||
<groupId>org.slf4j</groupId>
|
|
||||||
<artifactId>slf4j-api</artifactId>
|
|
||||||
</exclusion>
|
|
||||||
</exclusions>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>dom4j</groupId>
|
<groupId>dom4j</groupId>
|
||||||
|
@ -160,6 +154,26 @@
|
||||||
<groupId>org.apache.zookeeper</groupId>
|
<groupId>org.apache.zookeeper</groupId>
|
||||||
<artifactId>zookeeper</artifactId>
|
<artifactId>zookeeper</artifactId>
|
||||||
</exclusion>
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<artifactId>ant</artifactId>
|
||||||
|
<groupId>org.apache.ant</groupId>
|
||||||
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<artifactId>antlr4-runtime</artifactId>
|
||||||
|
<groupId>org.antlr</groupId>
|
||||||
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<artifactId>woodstox-core</artifactId>
|
||||||
|
<groupId>com.fasterxml.woodstox</groupId>
|
||||||
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<artifactId>log4j</artifactId>
|
||||||
|
<groupId>*</groupId>
|
||||||
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>org.apache.logging.log4j</groupId>
|
||||||
|
<artifactId>*</artifactId>
|
||||||
|
</exclusion>
|
||||||
</exclusions>
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -206,5 +220,90 @@
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
<profiles>
|
||||||
|
<profile>
|
||||||
|
<id>spark-24</id>
|
||||||
|
<activation>
|
||||||
|
<activeByDefault>true</activeByDefault>
|
||||||
|
</activation>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>build-helper-maven-plugin</artifactId>
|
||||||
|
<version>3.4.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<sources>
|
||||||
|
<source>src/main/sparksolr-3</source>
|
||||||
|
</sources>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>spark-34</id>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>build-helper-maven-plugin</artifactId>
|
||||||
|
<version>3.4.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<sources>
|
||||||
|
<source>src/main/sparksolr-4</source>
|
||||||
|
</sources>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>spark-35</id>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>build-helper-maven-plugin</artifactId>
|
||||||
|
<version>3.4.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<sources>
|
||||||
|
<source>src/main/sparksolr-4</source>
|
||||||
|
</sources>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
|
</profiles>
|
||||||
|
|
||||||
</project>
|
</project>
|
|
@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
|
||||||
result
|
result
|
||||||
.getTitle()
|
.getTitle()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(t -> StringUtils.isNotBlank(t.getValue()))
|
||||||
.findFirst()
|
.findFirst()
|
||||||
.map(StructuredProperty::getValue)
|
|
||||||
.ifPresent(
|
.ifPresent(
|
||||||
title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
|
title -> {
|
||||||
|
re.setTitle(title);
|
||||||
|
re
|
||||||
|
.getTitle()
|
||||||
|
.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||||
|
});
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
|
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
|
||||||
result
|
result
|
||||||
|
|
|
@ -3,24 +3,16 @@ package eu.dnetlib.dhp.oa.provision;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
|
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
|
||||||
import static org.apache.spark.sql.functions.*;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.SparkContext;
|
import org.apache.spark.SparkContext;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.expressions.UserDefinedFunction;
|
|
||||||
import org.apache.spark.sql.types.DataTypes;
|
|
||||||
import org.apache.spark.util.LongAccumulator;
|
import org.apache.spark.util.LongAccumulator;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -45,9 +37,9 @@ import scala.Tuple2;
|
||||||
/**
|
/**
|
||||||
* XmlConverterJob converts the JoinedEntities as XML records
|
* XmlConverterJob converts the JoinedEntities as XML records
|
||||||
*/
|
*/
|
||||||
public class XmlConverterJob {
|
public class PayloadConverterJob {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
|
private static final Logger log = LoggerFactory.getLogger(PayloadConverterJob.class);
|
||||||
|
|
||||||
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
||||||
|
|
||||||
|
@ -56,8 +48,8 @@ public class XmlConverterJob {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
XmlConverterJob.class
|
PayloadConverterJob.class
|
||||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
|
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json")));
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
final Boolean isSparkSessionManaged = Optional
|
final Boolean isSparkSessionManaged = Optional
|
||||||
|
@ -72,6 +64,12 @@ public class XmlConverterJob {
|
||||||
final String outputPath = parser.get("outputPath");
|
final String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath: {}", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
|
final Boolean validateXML = Optional
|
||||||
|
.ofNullable(parser.get("validateXML"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.FALSE);
|
||||||
|
log.info("validateXML: {}", validateXML);
|
||||||
|
|
||||||
final String contextApiBaseUrl = parser.get("contextApiBaseUrl");
|
final String contextApiBaseUrl = parser.get("contextApiBaseUrl");
|
||||||
log.info("contextApiBaseUrl: {}", contextApiBaseUrl);
|
log.info("contextApiBaseUrl: {}", contextApiBaseUrl);
|
||||||
|
|
||||||
|
@ -86,18 +84,19 @@ public class XmlConverterJob {
|
||||||
|
|
||||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||||
removeOutputDir(spark, outputPath);
|
removeOutputDir(spark, outputPath);
|
||||||
convertToXml(
|
createPayloads(
|
||||||
spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl),
|
spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl),
|
||||||
VocabularyGroup.loadVocsFromIS(isLookup));
|
VocabularyGroup.loadVocsFromIS(isLookup), validateXML);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void convertToXml(
|
private static void createPayloads(
|
||||||
final SparkSession spark,
|
final SparkSession spark,
|
||||||
final String inputPath,
|
final String inputPath,
|
||||||
final String outputPath,
|
final String outputPath,
|
||||||
final ContextMapper contextMapper,
|
final ContextMapper contextMapper,
|
||||||
final VocabularyGroup vocabularies) {
|
final VocabularyGroup vocabularies,
|
||||||
|
final Boolean validateXML) {
|
||||||
|
|
||||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(
|
final XmlRecordFactory recordFactory = new XmlRecordFactory(
|
||||||
prepareAccumulators(spark.sparkContext()),
|
prepareAccumulators(spark.sparkContext()),
|
||||||
|
@ -118,7 +117,7 @@ public class XmlConverterJob {
|
||||||
.as(Encoders.kryo(JoinedEntity.class))
|
.as(Encoders.kryo(JoinedEntity.class))
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
|
(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
|
||||||
recordFactory.build(je),
|
recordFactory.build(je, validateXML),
|
||||||
ProvisionModelSupport.transform(je, contextMapper, vocabularies)),
|
ProvisionModelSupport.transform(je, contextMapper, vocabularies)),
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class)))
|
Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class)))
|
||||||
.map(
|
.map(
|
|
@ -2,42 +2,34 @@
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
package eu.dnetlib.dhp.oa.provision;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
import static org.apache.spark.sql.functions.col;
|
||||||
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.PriorityQueue;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
|
||||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
|
||||||
import org.apache.spark.api.java.function.Function;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Encoder;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.apache.spark.sql.expressions.Aggregator;
|
import org.apache.spark.sql.expressions.Window;
|
||||||
|
import org.apache.spark.sql.expressions.WindowSpec;
|
||||||
|
import org.apache.spark.sql.functions;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
|
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
|
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
|
* PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
|
||||||
|
@ -130,132 +122,36 @@ public class PrepareRelationsJob {
|
||||||
private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath,
|
private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath,
|
||||||
Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) {
|
Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) {
|
||||||
|
|
||||||
JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
|
WindowSpec source_w = Window
|
||||||
.filter(rel -> !(rel.getSource().startsWith("unresolved") || rel.getTarget().startsWith("unresolved")))
|
.partitionBy("source", "subRelType")
|
||||||
.filter(rel -> !rel.getDataInfo().getDeletedbyinference())
|
.orderBy(col("target").desc_nulls_last());
|
||||||
.filter(rel -> !relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())));
|
|
||||||
|
|
||||||
JavaRDD<Relation> pruned = pruneRels(
|
WindowSpec target_w = Window
|
||||||
pruneRels(
|
.partitionBy("target", "subRelType")
|
||||||
rels,
|
.orderBy(col("source").desc_nulls_last());
|
||||||
sourceMaxRelations, relPartitions, (Function<Relation, String>) Relation::getSource),
|
|
||||||
targetMaxRelations, relPartitions, (Function<Relation, String>) Relation::getTarget);
|
|
||||||
spark
|
|
||||||
.createDataset(pruned.rdd(), Encoders.bean(Relation.class))
|
|
||||||
.repartition(relPartitions)
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.parquet(outputPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static JavaRDD<Relation> pruneRels(JavaRDD<Relation> rels, int maxRelations,
|
|
||||||
int relPartitions, Function<Relation, String> idFn) {
|
|
||||||
return rels
|
|
||||||
.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r))
|
|
||||||
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
|
|
||||||
.groupBy(Tuple2::_1)
|
|
||||||
.map(Tuple2::_2)
|
|
||||||
.map(t -> Iterables.limit(t, maxRelations))
|
|
||||||
.flatMap(Iterable::iterator)
|
|
||||||
.map(Tuple2::_2);
|
|
||||||
}
|
|
||||||
|
|
||||||
// experimental
|
|
||||||
private static void prepareRelationsDataset(
|
|
||||||
SparkSession spark, String inputRelationsPath, String outputPath, Set<String> relationFilter, int maxRelations,
|
|
||||||
int relPartitions) {
|
|
||||||
spark
|
spark
|
||||||
.read()
|
.read()
|
||||||
.textFile(inputRelationsPath)
|
.schema(Encoders.bean(Relation.class).schema())
|
||||||
.repartition(relPartitions)
|
.json(inputRelationsPath)
|
||||||
.map(
|
.where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'")
|
||||||
(MapFunction<String, Relation>) s -> OBJECT_MAPPER.readValue(s, Relation.class),
|
.where("datainfo.deletedbyinference != true")
|
||||||
Encoders.kryo(Relation.class))
|
.where(
|
||||||
.filter((FilterFunction<Relation>) rel -> !rel.getDataInfo().getDeletedbyinference())
|
relationFilter.isEmpty() ? ""
|
||||||
.filter((FilterFunction<Relation>) rel -> !relationFilter.contains(rel.getRelClass()))
|
: "lower(relClass) NOT IN ("
|
||||||
.groupByKey(
|
+ relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")")
|
||||||
(MapFunction<Relation, String>) Relation::getSource,
|
.withColumn("source_w_pos", functions.row_number().over(source_w))
|
||||||
Encoders.STRING())
|
.where("source_w_pos < " + sourceMaxRelations)
|
||||||
.agg(new RelationAggregator(maxRelations).toColumn())
|
.drop("source_w_pos")
|
||||||
.flatMap(
|
.withColumn("target_w_pos", functions.row_number().over(target_w))
|
||||||
(FlatMapFunction<Tuple2<String, RelationList>, Relation>) t -> Iterables
|
.where("target_w_pos < " + targetMaxRelations)
|
||||||
.limit(t._2().getRelations(), maxRelations)
|
.drop("target_w_pos")
|
||||||
.iterator(),
|
.coalesce(relPartitions)
|
||||||
Encoders.bean(Relation.class))
|
|
||||||
.repartition(relPartitions)
|
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.parquet(outputPath);
|
.parquet(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class RelationAggregator
|
|
||||||
extends Aggregator<Relation, RelationList, RelationList> {
|
|
||||||
|
|
||||||
private final int maxRelations;
|
|
||||||
|
|
||||||
public RelationAggregator(int maxRelations) {
|
|
||||||
this.maxRelations = maxRelations;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public RelationList zero() {
|
|
||||||
return new RelationList();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public RelationList reduce(RelationList b, Relation a) {
|
|
||||||
b.getRelations().add(a);
|
|
||||||
return getSortableRelationList(b);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public RelationList merge(RelationList b1, RelationList b2) {
|
|
||||||
b1.getRelations().addAll(b2.getRelations());
|
|
||||||
return getSortableRelationList(b1);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public RelationList finish(RelationList r) {
|
|
||||||
return getSortableRelationList(r);
|
|
||||||
}
|
|
||||||
|
|
||||||
private RelationList getSortableRelationList(RelationList b1) {
|
|
||||||
RelationList sr = new RelationList();
|
|
||||||
sr
|
|
||||||
.setRelations(
|
|
||||||
b1
|
|
||||||
.getRelations()
|
|
||||||
.stream()
|
|
||||||
.limit(maxRelations)
|
|
||||||
.collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator()))));
|
|
||||||
return sr;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Encoder<RelationList> bufferEncoder() {
|
|
||||||
return Encoders.kryo(RelationList.class);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Encoder<RelationList> outputEncoder() {
|
|
||||||
return Encoders.kryo(RelationList.class);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
|
|
||||||
* file,
|
|
||||||
*
|
|
||||||
* @param spark
|
|
||||||
* @param inputPath
|
|
||||||
* @return the JavaRDD<SortableRelation> containing all the relationships
|
|
||||||
*/
|
|
||||||
private static JavaRDD<Relation> readPathRelationRDD(
|
|
||||||
SparkSession spark, final String inputPath) {
|
|
||||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void removeOutputDir(SparkSession spark, String path) {
|
private static void removeOutputDir(SparkSession spark, String path) {
|
||||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,44 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
import com.google.common.collect.ComparisonChain;
|
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
|
|
||||||
public class RelationComparator implements Comparator<Relation> {
|
|
||||||
|
|
||||||
private static final Map<String, Integer> weights = Maps.newHashMap();
|
|
||||||
|
|
||||||
static {
|
|
||||||
weights.put(ModelConstants.OUTCOME, 0);
|
|
||||||
weights.put(ModelConstants.SUPPLEMENT, 1);
|
|
||||||
weights.put(ModelConstants.REVIEW, 2);
|
|
||||||
weights.put(ModelConstants.CITATION, 3);
|
|
||||||
weights.put(ModelConstants.AFFILIATION, 4);
|
|
||||||
weights.put(ModelConstants.RELATIONSHIP, 5);
|
|
||||||
weights.put(ModelConstants.PUBLICATION_DATASET, 6);
|
|
||||||
weights.put(ModelConstants.SIMILARITY, 7);
|
|
||||||
|
|
||||||
weights.put(ModelConstants.PROVISION, 8);
|
|
||||||
weights.put(ModelConstants.PARTICIPATION, 9);
|
|
||||||
weights.put(ModelConstants.DEDUP, 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Integer getWeight(Relation o) {
|
|
||||||
return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(Relation o1, Relation o2) {
|
|
||||||
return ComparisonChain
|
|
||||||
.start()
|
|
||||||
.compare(getWeight(o1), getWeight(o2))
|
|
||||||
.result();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,25 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.PriorityQueue;
|
|
||||||
import java.util.Queue;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
|
|
||||||
public class RelationList implements Serializable {
|
|
||||||
|
|
||||||
private Queue<Relation> relations;
|
|
||||||
|
|
||||||
public RelationList() {
|
|
||||||
this.relations = new PriorityQueue<>(new RelationComparator());
|
|
||||||
}
|
|
||||||
|
|
||||||
public Queue<Relation> getRelations() {
|
|
||||||
return relations;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setRelations(Queue<Relation> relations) {
|
|
||||||
this.relations = relations;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,81 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
||||||
import com.google.common.collect.ComparisonChain;
|
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
|
|
||||||
public class SortableRelation extends Relation implements Comparable<SortableRelation>, Serializable {
|
|
||||||
|
|
||||||
private static final Map<String, Integer> weights = Maps.newHashMap();
|
|
||||||
|
|
||||||
static {
|
|
||||||
weights.put(ModelConstants.OUTCOME, 0);
|
|
||||||
weights.put(ModelConstants.SUPPLEMENT, 1);
|
|
||||||
weights.put(ModelConstants.REVIEW, 2);
|
|
||||||
weights.put(ModelConstants.CITATION, 3);
|
|
||||||
weights.put(ModelConstants.AFFILIATION, 4);
|
|
||||||
weights.put(ModelConstants.RELATIONSHIP, 5);
|
|
||||||
weights.put(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, 6);
|
|
||||||
weights.put(ModelConstants.SIMILARITY, 7);
|
|
||||||
|
|
||||||
weights.put(ModelConstants.PROVISION, 8);
|
|
||||||
weights.put(ModelConstants.PARTICIPATION, 9);
|
|
||||||
weights.put(ModelConstants.DEDUP, 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final long serialVersionUID = 34753984579L;
|
|
||||||
|
|
||||||
private String groupingKey;
|
|
||||||
|
|
||||||
public static SortableRelation create(Relation r, String groupingKey) {
|
|
||||||
SortableRelation sr = new SortableRelation();
|
|
||||||
sr.setGroupingKey(groupingKey);
|
|
||||||
sr.setSource(r.getSource());
|
|
||||||
sr.setTarget(r.getTarget());
|
|
||||||
sr.setRelType(r.getRelType());
|
|
||||||
sr.setSubRelType(r.getSubRelType());
|
|
||||||
sr.setRelClass(r.getRelClass());
|
|
||||||
sr.setDataInfo(r.getDataInfo());
|
|
||||||
sr.setCollectedfrom(r.getCollectedfrom());
|
|
||||||
sr.setLastupdatetimestamp(r.getLastupdatetimestamp());
|
|
||||||
sr.setProperties(r.getProperties());
|
|
||||||
sr.setValidated(r.getValidated());
|
|
||||||
sr.setValidationDate(r.getValidationDate());
|
|
||||||
|
|
||||||
return sr;
|
|
||||||
}
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public Relation asRelation() {
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compareTo(SortableRelation o) {
|
|
||||||
return ComparisonChain
|
|
||||||
.start()
|
|
||||||
.compare(getGroupingKey(), o.getGroupingKey())
|
|
||||||
.compare(getWeight(this), getWeight(o))
|
|
||||||
.result();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Integer getWeight(SortableRelation o) {
|
|
||||||
return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getGroupingKey() {
|
|
||||||
return groupingKey;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setGroupingKey(String groupingKey) {
|
|
||||||
this.groupingKey = groupingKey;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -25,6 +25,7 @@ import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
|
import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
|
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
|
import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
|
||||||
|
import eu.dnetlib.dhp.sparksolr.DHPSolrSupport;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
|
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
@ -129,7 +130,7 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
|
||||||
.javaRDD()
|
.javaRDD()
|
||||||
.map(
|
.map(
|
||||||
t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
|
t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
|
||||||
SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
|
DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,6 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.provision.model;
|
package eu.dnetlib.dhp.oa.provision.model;
|
||||||
|
|
||||||
import static org.apache.commons.lang3.StringUtils.substringBefore;
|
|
||||||
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -16,16 +14,15 @@ import org.jetbrains.annotations.Nullable;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm;
|
||||||
import eu.dnetlib.dhp.oa.provision.RelationList;
|
|
||||||
import eu.dnetlib.dhp.oa.provision.SortableRelation;
|
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
|
import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.solr.*;
|
import eu.dnetlib.dhp.schema.solr.*;
|
||||||
import eu.dnetlib.dhp.schema.solr.AccessRight;
|
import eu.dnetlib.dhp.schema.solr.AccessRight;
|
||||||
import eu.dnetlib.dhp.schema.solr.Author;
|
import eu.dnetlib.dhp.schema.solr.Author;
|
||||||
|
@ -55,10 +52,7 @@ public class ProvisionModelSupport {
|
||||||
.newArrayList(
|
.newArrayList(
|
||||||
RelatedEntityWrapper.class,
|
RelatedEntityWrapper.class,
|
||||||
JoinedEntity.class,
|
JoinedEntity.class,
|
||||||
RelatedEntity.class,
|
RelatedEntity.class));
|
||||||
SortableRelationKey.class,
|
|
||||||
SortableRelation.class,
|
|
||||||
RelationList.class));
|
|
||||||
return modelClasses.toArray(new Class[] {});
|
return modelClasses.toArray(new Class[] {});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -74,7 +68,11 @@ public class ProvisionModelSupport {
|
||||||
.setHeader(
|
.setHeader(
|
||||||
SolrRecordHeader
|
SolrRecordHeader
|
||||||
.newInstance(
|
.newInstance(
|
||||||
e.getId(), e.getOriginalId(), type, deletedbyinference));
|
StringUtils
|
||||||
|
.substringAfter(
|
||||||
|
e.getId(),
|
||||||
|
IdentifierFactory.ID_PREFIX_SEPARATOR),
|
||||||
|
e.getOriginalId(), type, deletedbyinference));
|
||||||
r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
|
r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
|
||||||
r.setContext(asContext(e.getContext(), contextMapper));
|
r.setContext(asContext(e.getContext(), contextMapper));
|
||||||
r.setPid(asPid(e.getPid()));
|
r.setPid(asPid(e.getPid()));
|
||||||
|
@ -114,7 +112,8 @@ public class ProvisionModelSupport {
|
||||||
.newInstance(
|
.newInstance(
|
||||||
relation.getRelType(),
|
relation.getRelType(),
|
||||||
relation.getRelClass(),
|
relation.getRelClass(),
|
||||||
relation.getTarget(), relatedRecordType));
|
StringUtils.substringAfter(relation.getTarget(), IdentifierFactory.ID_PREFIX_SEPARATOR),
|
||||||
|
relatedRecordType));
|
||||||
|
|
||||||
rr.setAcronym(re.getAcronym());
|
rr.setAcronym(re.getAcronym());
|
||||||
rr.setCode(re.getCode());
|
rr.setCode(re.getCode());
|
||||||
|
@ -147,6 +146,7 @@ public class ProvisionModelSupport {
|
||||||
ps.setContracttype(mapCodeLabel(p.getContracttype()));
|
ps.setContracttype(mapCodeLabel(p.getContracttype()));
|
||||||
ps.setCurrency(mapField(p.getCurrency()));
|
ps.setCurrency(mapField(p.getCurrency()));
|
||||||
ps.setDuration(mapField(p.getDuration()));
|
ps.setDuration(mapField(p.getDuration()));
|
||||||
|
ps.setOamandatepublications(mapField(p.getOamandatepublications()));
|
||||||
ps.setCallidentifier(mapField(p.getCallidentifier()));
|
ps.setCallidentifier(mapField(p.getCallidentifier()));
|
||||||
ps.setEcarticle29_3(mapField(p.getEcarticle29_3()));
|
ps.setEcarticle29_3(mapField(p.getEcarticle29_3()));
|
||||||
ps.setEnddate(mapField(p.getEnddate()));
|
ps.setEnddate(mapField(p.getEnddate()));
|
||||||
|
@ -387,7 +387,7 @@ public class ProvisionModelSupport {
|
||||||
.equals(
|
.equals(
|
||||||
Optional
|
Optional
|
||||||
.ofNullable(t.getQualifier())
|
.ofNullable(t.getQualifier())
|
||||||
.map(Qualifier::getClassid)
|
.map(Qualifier::getClassname)
|
||||||
.orElse(null)))
|
.orElse(null)))
|
||||||
.map(StructuredProperty::getValue)
|
.map(StructuredProperty::getValue)
|
||||||
.collect(Collectors.toList()))
|
.collect(Collectors.toList()))
|
||||||
|
@ -405,7 +405,7 @@ public class ProvisionModelSupport {
|
||||||
.equals(
|
.equals(
|
||||||
Optional
|
Optional
|
||||||
.ofNullable(t.getQualifier())
|
.ofNullable(t.getQualifier())
|
||||||
.map(Qualifier::getClassid)
|
.map(Qualifier::getClassname)
|
||||||
.orElse(null)))
|
.orElse(null)))
|
||||||
.map(StructuredProperty::getValue)
|
.map(StructuredProperty::getValue)
|
||||||
.findFirst())
|
.findFirst())
|
||||||
|
@ -472,7 +472,7 @@ public class ProvisionModelSupport {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String mapQualifier(eu.dnetlib.dhp.schema.oaf.Qualifier q) {
|
private static String mapQualifier(eu.dnetlib.dhp.schema.oaf.Qualifier q) {
|
||||||
return Optional.ofNullable(q).map(Qualifier::getClassid).orElse(null);
|
return Optional.ofNullable(q).map(Qualifier::getClassname).orElse(null);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Journal mapJournal(eu.dnetlib.dhp.schema.oaf.Journal joaf) {
|
private static Journal mapJournal(eu.dnetlib.dhp.schema.oaf.Journal joaf) {
|
||||||
|
@ -581,7 +581,7 @@ public class ProvisionModelSupport {
|
||||||
.map(
|
.map(
|
||||||
pids -> pids
|
pids -> pids
|
||||||
.stream()
|
.stream()
|
||||||
.map(p -> Pid.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
.map(p -> Pid.newInstance(p.getQualifier().getClassname(), p.getValue()))
|
||||||
.collect(Collectors.toList()))
|
.collect(Collectors.toList()))
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
}
|
}
|
||||||
|
@ -606,8 +606,8 @@ public class ProvisionModelSupport {
|
||||||
subjects -> subjects
|
subjects -> subjects
|
||||||
.stream()
|
.stream()
|
||||||
.filter(s -> Objects.nonNull(s.getQualifier()))
|
.filter(s -> Objects.nonNull(s.getQualifier()))
|
||||||
.filter(s -> Objects.nonNull(s.getQualifier().getClassid()))
|
.filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
|
||||||
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassid()))
|
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname()))
|
||||||
.collect(Collectors.toList()))
|
.collect(Collectors.toList()))
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
}
|
}
|
||||||
|
@ -619,8 +619,8 @@ public class ProvisionModelSupport {
|
||||||
subjects -> subjects
|
subjects -> subjects
|
||||||
.stream()
|
.stream()
|
||||||
.filter(s -> Objects.nonNull(s.getQualifier()))
|
.filter(s -> Objects.nonNull(s.getQualifier()))
|
||||||
.filter(s -> Objects.nonNull(s.getQualifier().getClassid()))
|
.filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
|
||||||
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassid()))
|
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname()))
|
||||||
.collect(Collectors.toList()))
|
.collect(Collectors.toList()))
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
}
|
}
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue