forked from D-Net/dnet-hadoop
Compare commits
135 Commits
Author | SHA1 | Date |
---|---|---|
Claudio Atzori | 38f8ed27fd | |
Claudio Atzori | 1fb44198fb | |
Claudio Atzori | 6f6e85ddf4 | |
Claudio Atzori | 7fa3d51200 | |
Michele Artini | f99fb21040 | |
Claudio Atzori | e17edb2581 | |
Claudio Atzori | 61d1fa9b9f | |
Claudio Atzori | f9ed2ae33c | |
Miriam Baglioni | 814e650e12 | |
Claudio Atzori | 1180d78b71 | |
Claudio Atzori | 7d3292551b | |
Claudio Atzori | c7634c55c7 | |
Michele De Bonis | a10e8d9f05 | |
Claudio Atzori | 14539f9c8b | |
Claudio Atzori | 1bc8c5d173 | |
Claudio Atzori | 1ccf01cdb8 | |
Claudio Atzori | b79cb155ba | |
Claudio Atzori | 33a02c5b9e | |
Claudio Atzori | 1182bca9eb | |
Claudio Atzori | 1c30eacac2 | |
Claudio Atzori | 6055212f77 | |
Claudio Atzori | 0031cf849e | |
Serafeim Chatzopoulos | 9f6e16a03c | |
Lampros Smyrnaios | 66cd28f70a | |
Lampros Smyrnaios | c6b1ab2a18 | |
Miriam Baglioni | d35edac212 | |
Miriam Baglioni | 6421f8fece | |
Miriam Baglioni | ac270f795b | |
Lampros Smyrnaios | 236aed8954 | |
Claudio Atzori | dd541f8cf5 | |
Lampros Smyrnaios | ff335578ea | |
Lampros Smyrnaios | 285416c74e | |
Lampros Smyrnaios | 3095047e5e | |
Antonis Lempesis | 0456f1b788 | |
Antonis Lempesis | 38636942c7 | |
Lampros Smyrnaios | d942a1101b | |
Giambattista Bloisi | 9bf2bda1c6 | |
Giambattista Bloisi | d90cb099b8 | |
Giambattista Bloisi | 4f2a61e10f | |
Claudio Atzori | 11fe3a4fe0 | |
Claudio Atzori | a8d68c9d29 | |
Miriam Baglioni | 8fe934810f | |
Miriam Baglioni | 9da006e98c | |
Giambattista Bloisi | 85c1eae7e0 | |
Claudio Atzori | b0eba210c0 | |
Claudio Atzori | 3776327a8c | |
Claudio Atzori | 0139f23d66 | |
Michele Artini | c726572418 | |
Claudio Atzori | ec79405cc9 | |
Miriam Baglioni | 1477406ecc | |
Claudio Atzori | 92c3abd5a4 | |
Claudio Atzori | ce2364743a | |
Claudio Atzori | f70dc76b61 | |
Claudio Atzori | 73bd1938a5 | |
Claudio Atzori | da5c1e73a4 | |
Claudio Atzori | a02f3f0d2b | |
Alessia Bardi | eadfd8d71d | |
Alessia Bardi | 05ee783c07 | |
Alessia Bardi | fe9fb59c90 | |
Claudio Atzori | c272c4ad68 | |
Alessia Bardi | c5f4da16a4 | |
Alessia | 1b165a14a0 | |
Michele Artini | e996787be2 | |
Claudio Atzori | 62716141c5 | |
Miriam Baglioni | 5d85b70e1f | |
Lampros Smyrnaios | e3f28338c1 | |
Giambattista Bloisi | 73316d8c83 | |
Miriam Baglioni | 75d5ddb999 | |
Miriam Baglioni | 87c9c61b41 | |
Miriam Baglioni | b55fed09f8 | |
Claudio Atzori | 107d958b89 | |
Claudio Atzori | 3a7a6ecc32 | |
Claudio Atzori | 1af4224d3d | |
Claudio Atzori | 0d5bdb2db0 | |
Claudio Atzori | 66548e6a83 | |
Antonis Lempesis | 15b54a345a | |
Lampros Smyrnaios | b48ed6e617 | |
Lampros Smyrnaios | 68322843e2 | |
Lampros Smyrnaios | c7b32bbacc | |
Giambattista Bloisi | 1b2357e10a | |
Sandro La Bruzzo | f1fe363b19 | |
Sandro La Bruzzo | 66c1ffc866 | |
Claudio Atzori | 1ea67eba82 | |
Claudio Atzori | f9fb2fef6e | |
Claudio Atzori | 834461ba26 | |
Sandro La Bruzzo | e8a61d5dd5 | |
Sandro La Bruzzo | ca9414b737 | |
Sandro La Bruzzo | 032bcc8279 | |
Sandro La Bruzzo | 103e2652b3 | |
Sandro La Bruzzo | a87f9ea643 | |
Sandro La Bruzzo | 6efab4d88e | |
Claudio Atzori | 92f018d196 | |
Claudio Atzori | 0611c81a2f | |
Claudio Atzori | 1efe7f7e39 | |
Claudio Atzori | 53e7bb4336 | |
Claudio Atzori | f7d56e2ef2 | |
Claudio Atzori | c1237ab39e | |
Claudio Atzori | dc3a5858f7 | |
Claudio Atzori | 55f39f7850 | |
Claudio Atzori | 39a2afe8b5 | |
Claudio Atzori | 908ed9da7a | |
Antonis Lempesis | 0cada3cc8f | |
Antonis Lempesis | 90a4fb3547 | |
Claudio Atzori | 18aa323ee9 | |
Claudio Atzori | b4e3389432 | |
Giambattista Bloisi | 711048ceed | |
Sandro La Bruzzo | db358ad0d2 | |
Sandro La Bruzzo | 26bf8e763a | |
Sandro La Bruzzo | a860c57bbc | |
Sandro La Bruzzo | 0646d0d064 | |
Michele Artini | f4068de298 | |
Michele Artini | 2615136efc | |
Sandro La Bruzzo | 133ead1e3e | |
Sandro La Bruzzo | 052c6aac9d | |
Sandro La Bruzzo | 9cd3bc0f10 | |
Sandro La Bruzzo | 0d628cd62b | |
Lampros Smyrnaios | 49af2e5740 | |
Antonis Lempesis | d2649a1429 | |
Sandro La Bruzzo | 073f320c6a | |
Sandro La Bruzzo | b84ad0c06e | |
Antonis Lempesis | b52a5a753b | |
Sandro La Bruzzo | 8dd9cf84e2 | |
Sandro La Bruzzo | 342cb6189b | |
Antonis Lempesis | c3fe9662b2 | |
Antonis Lempesis | 0c71c58df6 | |
Antonis Lempesis | 43d05dbebb | |
Antonis Lempesis | e728a0897c | |
Antonis Lempesis | 308ae580a9 | |
Antonis Lempesis | 27d22bd8f9 | |
Antonis Lempesis | 1f5aba12fa | |
Giambattista Bloisi | 613ec5ffce | |
Sandro La Bruzzo | 52495f2cd2 | |
Sandro La Bruzzo | 8c3e9a09d3 | |
Giambattista Bloisi | 2fa78f6071 | |
Giambattista Bloisi | 326c9dc08c | |
@@ -27,3 +27,4 @@ spark-warehouse
 /**/.factorypath
 /**/.scalafmt.conf
 /.java-version
+/dhp-shade-package/dependency-reduced-pom.xml
@@ -80,7 +80,15 @@ class WritePredefinedProjectPropertiesTest {
         mojo.outputFile = testFolder;

         // execute
-        Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute());
+        try {
+            mojo.execute();
+            Assertions.assertTrue(false); // not reached
+        } catch (Exception e) {
+            Assertions
+                .assertTrue(
+                    MojoExecutionException.class.isAssignableFrom(e.getClass()) ||
+                        IllegalArgumentException.class.isAssignableFrom(e.getClass()));
+        }
     }

     @Test
@@ -70,10 +70,7 @@
            <groupId>com.ibm.icu</groupId>
            <artifactId>icu4j</artifactId>
        </dependency>
-        <dependency>
-            <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-common</artifactId>
-        </dependency>
        <dependency>
            <groupId>com.github.sisyphsu</groupId>
            <artifactId>dateparser</artifactId>
@@ -163,7 +160,7 @@

        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
-            <artifactId>${dhp-schemas.artifact}</artifactId>
+            <artifactId>dhp-schemas</artifactId>
        </dependency>

        <dependency>
@@ -172,4 +169,23 @@
        </dependency>
    </dependencies>

+    <!-- dependencies required on JDK9+ because J2EE has been removed -->
+    <profiles>
+        <profile>
+            <id>spark-34</id>
+            <dependencies>
+                <dependency>
+                    <groupId>javax.xml.bind</groupId>
+                    <artifactId>jaxb-api</artifactId>
+                    <version>2.2.11</version>
+                </dependency>
+                <dependency>
+                    <groupId>com.sun.xml.ws</groupId>
+                    <artifactId>jaxws-ri</artifactId>
+                    <version>2.3.3</version>
+                    <type>pom</type>
+                </dependency>
+            </dependencies>
+        </profile>
+    </profiles>
 </project>
@@ -38,7 +38,7 @@ public class PacePerson {
                    PacePerson.class
                        .getResourceAsStream(
                            "/eu/dnetlib/dhp/common/name_particles.txt")));
-        } catch (IOException e) {
+        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
@@ -217,8 +217,6 @@ public class ZenodoAPIClient implements Serializable {
     * part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
     * concept_rec_id = 656930
     * @return response code
-     * @throws IOException
-     * @throws MissingConceptDoiException
     */
    public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
        setDepositionId(concept_rec_id, 1);
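
A minimal sketch of how the newVersion call above is typically driven from client code; only the newVersion(concept_rec_id) signature and its checked exceptions come from the diff, while the package names, the constructor arguments and the success handling are assumptions.

// Hypothetical caller of ZenodoAPIClient.newVersion; base URL, token handling and package names are assumed.
import eu.dnetlib.dhp.common.api.MissingConceptDoiException;
import eu.dnetlib.dhp.common.api.ZenodoAPIClient;

public class ZenodoNewVersionExample {
    public static void main(String[] args) throws Exception {
        ZenodoAPIClient client = new ZenodoAPIClient(
            "https://zenodo.org/api/deposit/depositions", // assumed deposition endpoint
            System.getenv("ZENODO_ACCESS_TOKEN")); // assumed token source
        try {
            // concept_rec_id as in the javadoc example above (656930)
            int responseCode = client.newVersion("656930");
            System.out.println("Zenodo responded with HTTP " + responseCode);
        } catch (MissingConceptDoiException e) {
            // raised when no concept DOI can be resolved for the given concept_rec_id
            throw new IllegalStateException(e);
        }
    }
}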
@@ -12,9 +12,7 @@ import java.util.concurrent.TimeUnit;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.math.NumberUtils;
-import org.apache.commons.lang3.time.DateUtils;
 import org.apache.http.HttpHeaders;
-import org.joda.time.Instant;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -0,0 +1,106 @@
+
+package eu.dnetlib.dhp.schema.oaf.utils;
+
+import java.util.*;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Result;
+
+public class MergeEntitiesComparator implements Comparator<Oaf> {
+    static final List<String> PID_AUTHORITIES = Arrays
+        .asList(
+            ModelConstants.ARXIV_ID,
+            ModelConstants.PUBMED_CENTRAL_ID,
+            ModelConstants.EUROPE_PUBMED_CENTRAL_ID,
+            ModelConstants.DATACITE_ID,
+            ModelConstants.CROSSREF_ID);
+
+    static final List<String> RESULT_TYPES = Arrays
+        .asList(
+            ModelConstants.ORP_RESULTTYPE_CLASSID,
+            ModelConstants.SOFTWARE_RESULTTYPE_CLASSID,
+            ModelConstants.DATASET_RESULTTYPE_CLASSID,
+            ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
+
+    public static final Comparator<Oaf> INSTANCE = new MergeEntitiesComparator();
+
+    @Override
+    public int compare(Oaf left, Oaf right) {
+        if (left == null && right == null)
+            return 0;
+        if (left == null)
+            return -1;
+        if (right == null)
+            return 1;
+
+        int res = 0;
+
+        // pid authority
+        int cfp1 = Optional
+            .ofNullable(left.getCollectedfrom())
+            .map(
+                cf -> cf
+                    .stream()
+                    .map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
+                    .max(Integer::compare)
+                    .orElse(-1))
+            .orElse(-1);
+        int cfp2 = Optional
+            .ofNullable(right.getCollectedfrom())
+            .map(
+                cf -> cf
+                    .stream()
+                    .map(kv -> PID_AUTHORITIES.indexOf(kv.getKey()))
+                    .max(Integer::compare)
+                    .orElse(-1))
+            .orElse(-1);
+
+        if (cfp1 >= 0 && cfp1 > cfp2) {
+            return 1;
+        } else if (cfp2 >= 0 && cfp2 > cfp1) {
+            return -1;
+        }
+
+        // trust
+        if (left.getDataInfo() != null && right.getDataInfo() != null) {
+            res = left.getDataInfo().getTrust().compareTo(right.getDataInfo().getTrust());
+        }
+
+        // result type
+        if (res == 0) {
+            if (left instanceof Result && right instanceof Result) {
+                Result r1 = (Result) left;
+                Result r2 = (Result) right;
+
+                if (r1.getResulttype() == null || r1.getResulttype().getClassid() == null) {
+                    if (r2.getResulttype() != null && r2.getResulttype().getClassid() != null) {
+                        return -1;
+                    }
+                } else if (r2.getResulttype() == null || r2.getResulttype().getClassid() == null) {
+                    return 1;
+                }
+
+                int rt1 = RESULT_TYPES.indexOf(r1.getResulttype().getClassid());
+                int rt2 = RESULT_TYPES.indexOf(r2.getResulttype().getClassid());
+
+                if (rt1 >= 0 && rt1 > rt2) {
+                    return 1;
+                } else if (rt2 >= 0 && rt2 > rt1) {
+                    return -1;
+                }
+            }
+        }
+
+        // id
+        if (res == 0) {
+            if (left instanceof OafEntity && right instanceof OafEntity) {
+                res = ((OafEntity) left).getId().compareTo(((OafEntity) right).getId());
+            }
+        }
+
+        return res;
+    }
+
+}
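
To make the intended ordering concrete, a small usage sketch (mirroring the mergeGroup change further below); the pickBase helper and the way the grouped entities are obtained are hypothetical, only MergeEntitiesComparator and its INSTANCE field come from the diff.

// Hypothetical driver code: sorts a group of entities and picks the strongest one as merge base.
import java.util.ArrayList;
import java.util.List;

import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.utils.MergeEntitiesComparator;

public class MergeOrderingExample {
    public static Oaf pickBase(List<Oaf> groupedEntities) {
        // Reversed sort puts the record with the preferred PID authority, higher trust and
        // higher-priority result type (with the entity id as final tie break) first.
        List<Oaf> sorted = new ArrayList<>(groupedEntities);
        sorted.sort(MergeEntitiesComparator.INSTANCE.reversed());
        return sorted.get(0); // used as the base record of the merge
    }
}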
@@ -40,27 +40,12 @@ public class MergeUtils {

     public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
         boolean checkDelegateAuthority) {
-        TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
-            int res = 0;
-
-            if (o1.getDataInfo() != null && o2.getDataInfo() != null) {
-                res = o1.getDataInfo().getTrust().compareTo(o2.getDataInfo().getTrust());
-            }
-
-            if (res == 0) {
-                if (o1 instanceof Result && o2 instanceof Result) {
-                    return ResultTypeComparator.INSTANCE.compare((Result) o1, (Result) o2);
-                }
-            }
-
-            return res;
-        });
-
-        while (oafEntityIterator.hasNext()) {
-            sortedEntities.add(oafEntityIterator.next());
-        }
-
-        Iterator<T> it = sortedEntities.descendingIterator();
+
+        ArrayList<T> sortedEntities = new ArrayList<>();
+        oafEntityIterator.forEachRemaining(sortedEntities::add);
+        sortedEntities.sort(MergeEntitiesComparator.INSTANCE.reversed());
+
+        Iterator<T> it = sortedEntities.iterator();
         T merged = it.next();

         while (it.hasNext()) {
@@ -143,7 +128,7 @@ public class MergeUtils {
     * https://graph.openaire.eu/docs/data-model/pids-and-identifiers#delegated-authorities and in that case it prefers
     * such version.
     * <p>
-     * Otherwise, it considers a resulttype priority order implemented in {@link ResultTypeComparator}
+     * Otherwise, it considers a resulttype priority order implemented in {@link MergeEntitiesComparator}
     * and proceeds with the canonical property merging.
     *
     * @param left
@@ -161,8 +146,9 @@ public class MergeUtils {
         if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
             return right;
         }
+
         // TODO: raise trust to have preferred fields from one or the other??
-        if (new ResultTypeComparator().compare(left, right) < 0) {
+        if (MergeEntitiesComparator.INSTANCE.compare(left, right) > 0) {
             return mergeResultFields(left, right);
         } else {
             return mergeResultFields(right, left);
@@ -225,9 +211,9 @@ public class MergeUtils {

     private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
         Function<T, K> keyExtractor, BinaryOperator<T> merger) {
-        if (left == null) {
-            return right;
-        } else if (right == null) {
+        if (left == null || left.isEmpty()) {
+            return right != null ? right : new ArrayList<>();
+        } else if (right == null || right.isEmpty()) {
             return left;
         }
@@ -342,7 +328,7 @@ public class MergeUtils {
         final T merged = mergeOafFields(original, enrich, trust);

         merged.setOriginalId(unionDistinctListOfString(merged.getOriginalId(), enrich.getOriginalId()));
-        merged.setPid(unionDistinctLists(merged.getPid(), enrich.getPid(), trust));
+        merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1));
         merged.setDateofcollection(LocalDateTime.now().toString());
         merged
             .setDateoftransformation(
@@ -405,7 +391,7 @@ public class MergeUtils {
         }

         // should be an instance attribute, get the first non-null value
-        merge.setLanguage(coalesce(merge.getLanguage(), enrich.getLanguage()));
+        merge.setLanguage(coalesceQualifier(merge.getLanguage(), enrich.getLanguage()));

         // distinct countries, do not manage datainfo
         merge.setCountry(mergeQualifiers(merge.getCountry(), enrich.getCountry(), trust));
@@ -575,6 +561,13 @@ public class MergeUtils {
         return m != null ? m : e;
     }

+    private static Qualifier coalesceQualifier(Qualifier m, Qualifier e) {
+        if (m == null || m.getClassid() == null || StringUtils.isBlank(m.getClassid())) {
+            return e;
+        }
+        return m;
+    }
+
     private static List<Author> mergeAuthors(List<Author> author, List<Author> author1, int trust) {
         List<List<Author>> authors = new ArrayList<>();
         if (author != null) {
@@ -587,6 +580,10 @@ public class MergeUtils {
     }

     private static String instanceKeyExtractor(Instance i) {
+        // three levels of concatenating:
+        // 1. ::
+        // 2. @@
+        // 3. ||
         return String
             .join(
                 "::",
@@ -594,10 +591,10 @@ public class MergeUtils {
                 kvKeyExtractor(i.getCollectedfrom()),
                 qualifierKeyExtractor(i.getAccessright()),
                 qualifierKeyExtractor(i.getInstancetype()),
-                Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
+                Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null),
                 Optional
                     .ofNullable(i.getPid())
-                    .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
+                    .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@")))
                     .orElse(null));
     }

@@ -658,6 +655,13 @@ public class MergeUtils {
             return d1;
         }

+        if (StringUtils.contains(d1.getValue(), "null")) {
+            return d2;
+        }
+        if (StringUtils.contains(d2.getValue(), "null")) {
+            return d1;
+        }
+
         return Stream
             .of(d1, d2)
             .min(
@@ -706,7 +710,7 @@ public class MergeUtils {
     private static String spKeyExtractor(StructuredProperty sp) {
         return Optional
             .ofNullable(sp)
-            .map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
+            .map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue()))
             .orElse(null);
     }

@@ -1,87 +0,0 @@
-
-package eu.dnetlib.dhp.schema.oaf.utils;
-
-import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
-
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.Optional;
-import java.util.stream.Collectors;
-
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.KeyValue;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-public class ResultTypeComparator implements Comparator<Result> {
-
-    public static final ResultTypeComparator INSTANCE = new ResultTypeComparator();
-
-    @Override
-    public int compare(Result left, Result right) {
-
-        if (left == null && right == null)
-            return 0;
-        if (left == null)
-            return 1;
-        if (right == null)
-            return -1;
-
-        HashSet<String> lCf = getCollectedFromIds(left);
-        HashSet<String> rCf = getCollectedFromIds(right);
-
-        if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
-            return -1;
-        }
-        if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
-            return 1;
-        }
-
-        if (left.getResulttype() == null || left.getResulttype().getClassid() == null) {
-            if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
-                return 0;
-            }
-            return 1;
-        } else if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
-            return -1;
-        }
-
-        String lClass = left.getResulttype().getClassid();
-        String rClass = right.getResulttype().getClassid();
-
-        if (!lClass.equals(rClass)) {
-            if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
-                return -1;
-            if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
-                return 1;
-
-            if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
-                return -1;
-            if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
-                return 1;
-
-            if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
-                return -1;
-            if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
-                return 1;
-
-            if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
-                return -1;
-            if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
-                return 1;
-        }
-
-        // Else (but unlikely), lexicographical ordering will do.
-        return lClass.compareTo(rClass);
-    }
-
-    protected HashSet<String> getCollectedFromIds(Result left) {
-        return Optional
-            .ofNullable(left.getCollectedfrom())
-            .map(
-                cf -> cf
-                    .stream()
-                    .map(KeyValue::getKey)
-                    .collect(Collectors.toCollection(HashSet::new)))
-            .orElse(new HashSet<>());
-    }
-}
@@ -154,5 +154,13 @@
   "unknown":{
     "original":"Unknown",
     "inverse":"Unknown"
+  },
+  "isamongtopnsimilardocuments": {
+    "original": "IsAmongTopNSimilarDocuments",
+    "inverse": "HasAmongTopNSimilarDocuments"
+  },
+  "hasamongtopnsimilardocuments": {
+    "original": "HasAmongTopNSimilarDocuments",
+    "inverse": "IsAmongTopNSimilarDocuments"
   }
 }
@@ -65,12 +65,13 @@ abstract class AbstractScalaApplication(
     val conf: SparkConf = new SparkConf()
     val master = parser.get("master")
     log.info(s"Creating Spark session: Master: $master")
-    SparkSession
+    val b = SparkSession
       .builder()
       .config(conf)
       .appName(getClass.getSimpleName)
-      .master(master)
-      .getOrCreate()
+    if (master != null)
+      b.master(master)
+    b.getOrCreate()
   }

   def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
@@ -65,7 +65,11 @@ object ScholixUtils extends Serializable {
   }

   def generateScholixResourceFromResult(r: Result): ScholixResource = {
+    val sum = ScholixUtils.resultToSummary(r)
+    if (sum != null)
       generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
+    else
+      null
   }

   val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
@@ -153,6 +157,14 @@ object ScholixUtils extends Serializable {

   }

+  def invRel(rel: String): String = {
+    val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
+    if (semanticRelation != null)
+      semanticRelation.inverse
+    else
+      null
+  }
+
   def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
     if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
       val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
@@ -377,10 +389,7 @@ object ScholixUtils extends Serializable {
     if (persistentIdentifiers.isEmpty)
       return null
     s.setLocalIdentifier(persistentIdentifiers.asJava)
-    if (r.isInstanceOf[Publication])
-      s.setTypology(Typology.publication)
-    else
-      s.setTypology(Typology.dataset)
+    // s.setTypology(r.getResulttype.getClassid)

     s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
@@ -24,7 +24,7 @@
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
-                        <phase>initialize</phase>
+                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
@@ -59,14 +59,6 @@
            <groupId>edu.cmu</groupId>
            <artifactId>secondstring</artifactId>
        </dependency>
-        <dependency>
-            <groupId>com.google.guava</groupId>
-            <artifactId>guava</artifactId>
-        </dependency>
-        <dependency>
-            <groupId>com.google.code.gson</groupId>
-            <artifactId>gson</artifactId>
-        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
@@ -91,10 +83,6 @@
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
        </dependency>
-        <dependency>
-            <groupId>org.apache.commons</groupId>
-            <artifactId>commons-math3</artifactId>
-        </dependency>
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
@@ -113,4 +101,90 @@
        </dependency>
    </dependencies>

+    <profiles>
+        <profile>
+            <id>spark-24</id>
+            <activation>
+                <activeByDefault>true</activeByDefault>
+            </activation>
+
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.codehaus.mojo</groupId>
+                        <artifactId>build-helper-maven-plugin</artifactId>
+                        <version>3.4.0</version>
+                        <executions>
+                            <execution>
+                                <phase>generate-sources</phase>
+                                <goals>
+                                    <goal>add-source</goal>
+                                </goals>
+                                <configuration>
+                                    <sources>
+                                        <source>src/main/spark-2</source>
+                                    </sources>
+                                </configuration>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+
+        <profile>
+            <id>spark-34</id>
+
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.codehaus.mojo</groupId>
+                        <artifactId>build-helper-maven-plugin</artifactId>
+                        <version>3.4.0</version>
+                        <executions>
+                            <execution>
+                                <phase>generate-sources</phase>
+                                <goals>
+                                    <goal>add-source</goal>
+                                </goals>
+                                <configuration>
+                                    <sources>
+                                        <source>src/main/spark-2</source>
+                                    </sources>
+                                </configuration>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+
+        <profile>
+            <id>spark-35</id>
+
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.codehaus.mojo</groupId>
+                        <artifactId>build-helper-maven-plugin</artifactId>
+                        <version>3.4.0</version>
+                        <executions>
+                            <execution>
+                                <phase>generate-sources</phase>
+                                <goals>
+                                    <goal>add-source</goal>
+                                </goals>
+                                <configuration>
+                                    <sources>
+                                        <source>src/main/spark-35</source>
+                                    </sources>
+                                </configuration>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+    </profiles>
+
 </project>
@@ -1,12 +1,6 @@

 package eu.dnetlib.pace.common;

-import com.google.common.base.Joiner;
-import com.google.common.collect.Sets;
-import com.ibm.icu.text.Transliterator;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
-
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
@@ -15,6 +9,13 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;

+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+
 /**
  * Set of common functions for the framework
  *
@@ -3,7 +3,7 @@ package eu.dnetlib.pace.model
 import com.jayway.jsonpath.{Configuration, JsonPath}
 import eu.dnetlib.pace.common.AbstractPaceFunctions
 import eu.dnetlib.pace.config.{DedupConfig, Type}
-import eu.dnetlib.pace.util.MapDocumentUtil
+import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
 import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
@@ -52,7 +52,7 @@ case class SparkModel(conf: DedupConfig) {
   val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)

   val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-    df.map(r => rowFromJson(r))(RowEncoder(schema))
+    df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
   }

   def rowFromJson(json: String): Row = {
@@ -0,0 +1,47 @@
+package eu.dnetlib.pace.tree;
+
+import java.util.Map;
+
+import com.wcohen.ss.AbstractStringDistance;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+@ComparatorClass("countryMatch")
+public class CountryMatch extends AbstractStringComparator {
+
+    public CountryMatch(Map<String, String> params) {
+        super(params, new com.wcohen.ss.JaroWinkler());
+    }
+
+    public CountryMatch(final double weight) {
+        super(weight, new com.wcohen.ss.JaroWinkler());
+    }
+
+    protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
+        super(weight, ssalgo);
+    }
+
+    @Override
+    public double distance(final String a, final String b, final Config conf) {
+        if (a.isEmpty() || b.isEmpty()) {
+            return -1.0; // return -1 if a field is missing
+        }
+        if (a.equalsIgnoreCase("unknown") || b.equalsIgnoreCase("unknown")) {
+            return -1.0; // return -1 if a country is UNKNOWN
+        }
+
+        return a.equals(b) ? 1.0 : 0;
+    }
+
+    @Override
+    public double getWeight() {
+        return super.weight;
+    }
+
+    @Override
+    protected double normalize(final double d) {
+        return d;
+    }
+}
@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
+import org.apache.spark.sql.types.StructType
+
+object SparkCompatUtils {
+
+  def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
+    RowEncoder(schema)
+  }
+}
@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.types.StructType
+
+object SparkCompatUtils {
+
+  def encoderFor(schema: StructType): ExpressionEncoder[Row] = {
+    ExpressionEncoder(schema)
+  }
+}
@@ -336,4 +336,23 @@ public class ComparatorTest extends AbstractPaceTest {
         System.out.println("compare = " + compare);
     }

+    @Test
+    public void countryMatch() {
+
+        CountryMatch countryMatch = new CountryMatch(params);
+
+        double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
+        assertEquals(-1.0, result);
+
+        result = countryMatch.distance("CHILE", "UNKNOWN", conf);
+        assertEquals(-1.0, result);
+
+        result = countryMatch.distance("CHILE", "ITALY", conf);
+        assertEquals(0.0, result);
+
+        result = countryMatch.distance("CHILE", "CHILE", conf);
+        assertEquals(1.0, result);
+
+    }
+
 }
@@ -11,6 +11,7 @@ import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;

 import eu.dnetlib.pace.model.Person;
+import jdk.nashorn.internal.ir.annotations.Ignore;

 public class UtilTest {

@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <parent>
+    <artifactId>dhp</artifactId>
+    <groupId>eu.dnetlib.dhp</groupId>
+    <version>1.2.5-SNAPSHOT</version>
+  </parent>
+  <modelVersion>4.0.0</modelVersion>
+  <artifactId>dhp-shade-package</artifactId>
+  <description>This module create a jar of all module dependencies</description>
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-shade-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <transformers>
+                <transformer>
+                  <mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
+                </transformer>
+                <transformer />
+                <transformer>
+                  <resource>META-INF/cxf/bus-extensions.txt</resource>
+                </transformer>
+              </transformers>
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>META-INF/maven/**</exclude>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+              <relocations>
+                <relocation>
+                  <pattern>com</pattern>
+                  <shadedPattern>repackaged.com.google.common</shadedPattern>
+                  <includes>
+                    <include>com.google.common.**</include>
+                  </includes>
+                </relocation>
+              </relocations>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+  <dependencies>
+    <dependency>
+      <groupId>org.projectlombok</groupId>
+      <artifactId>lombok</artifactId>
+      <version>1.18.28</version>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter</artifactId>
+      <version>5.6.1</version>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <artifactId>junit-jupiter-api</artifactId>
+          <groupId>org.junit.jupiter</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>junit-jupiter-params</artifactId>
+          <groupId>org.junit.jupiter</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>junit-jupiter-engine</artifactId>
+          <groupId>org.junit.jupiter</groupId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <version>3.3.3</version>
+      <scope>test</scope>
+      <exclusions>
+        <exclusion>
+          <artifactId>byte-buddy</artifactId>
+          <groupId>net.bytebuddy</groupId>
+        </exclusion>
+        <exclusion>
+          <artifactId>byte-buddy-agent</artifactId>
+          <groupId>net.bytebuddy</groupId>
+        </exclusion>
+      </exclusions>
+    </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-junit-jupiter</artifactId>
+      <version>3.3.3</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  <distributionManagement>
+    <site>
+      <id>DHPSite</id>
+      <url>${dhp.site.stage.path}/dhp-common</url>
+    </site>
+  </distributionManagement>
+</project>
@@ -0,0 +1,169 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>eu.dnetlib.dhp</groupId>
+    <artifactId>dhp</artifactId>
+    <version>1.2.5-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+
+  </parent>
+
+  <artifactId>dhp-shade-package</artifactId>
+  <packaging>jar</packaging>
+
+  <distributionManagement>
+    <site>
+      <id>DHPSite</id>
+      <url>${dhp.site.stage.path}/dhp-common</url>
+    </site>
+  </distributionManagement>
+
+  <description>This module create a jar of all module dependencies</description>
+
+
+  <dependencies>
+
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-actionmanager</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <!-- <dependency>-->
+    <!-- <groupId>eu.dnetlib.dhp</groupId>-->
+    <!-- <artifactId>dhp-aggregation</artifactId>-->
+    <!-- <version>${project.version}</version>-->
+    <!-- </dependency>-->
+    <!-- <dependency>-->
+    <!-- <groupId>eu.dnetlib.dhp</groupId>-->
+    <!-- <artifactId>dhp-blacklist</artifactId>-->
+    <!-- <version>${project.version}</version>-->
+    <!-- </dependency>-->
+    <!-- <dependency>-->
+    <!-- <groupId>eu.dnetlib.dhp</groupId>-->
+    <!-- <artifactId>dhp-broker-events</artifactId>-->
+    <!-- <version>${project.version}</version>-->
+    <!-- </dependency>-->
+    <!-- <dependency>-->
+    <!-- <groupId>eu.dnetlib.dhp</groupId>-->
+    <!-- <artifactId>dhp-dedup-openaire</artifactId>-->
+    <!-- <version>${project.version}</version>-->
+    <!-- </dependency>-->
+    <!-- <dependency>-->
+    <!-- <groupId>eu.dnetlib.dhp</groupId>-->
+    <!-- <artifactId>dhp-enrichment</artifactId>-->
+    <!-- <version>${project.version}</version>-->
+    <!-- </dependency>-->
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-graph-mapper</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-graph-provision</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-impact-indicators</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-stats-actionsets</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-stats-hist-snaps</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-stats-monitor-irish</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-stats-promote</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-stats-update</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-swh</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-usage-raw-data-update</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>eu.dnetlib.dhp</groupId>
+      <artifactId>dhp-usage-stats-build</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <transformers>
+                <transformer
+                  implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                  <mainClass>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</mainClass>
+                </transformer>
+                <!-- This is needed if you have dependencies that use Service Loader. Most Google Cloud client libraries do. -->
+                <transformer
+                  implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                <transformer
+                  implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+                  <resource>META-INF/cxf/bus-extensions.txt</resource>
+                </transformer>
+              </transformers>
+              <filters>
+                <filter>
+                  <artifact>*:*</artifact>
+                  <excludes>
+                    <exclude>META-INF/maven/**</exclude>
+                    <exclude>META-INF/*.SF</exclude>
+                    <exclude>META-INF/*.DSA</exclude>
+                    <exclude>META-INF/*.RSA</exclude>
+                  </excludes>
+                </filter>
+              </filters>
+              <relocations>
+                <relocation>
+                  <pattern>com</pattern>
+                  <shadedPattern>repackaged.com.google.common</shadedPattern>
+                  <includes>
+                    <include>com.google.common.**</include>
+                  </includes>
+                </relocation>
+              </relocations>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
@@ -103,6 +103,7 @@
             --executor-memory=${sparkExecutorMemory}
             --executor-cores=${sparkExecutorCores}
             --driver-memory=${sparkDriverMemory}
+            --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
             --conf spark.extraListeners=${spark2ExtraListeners}
             --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -156,6 +157,7 @@
             --executor-memory=${sparkExecutorMemory}
             --executor-cores=${sparkExecutorCores}
             --driver-memory=${sparkDriverMemory}
+            --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
             --conf spark.extraListeners=${spark2ExtraListeners}
             --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -95,6 +95,7 @@
             --executor-memory=${sparkExecutorMemory}
             --executor-cores=${sparkExecutorCores}
             --driver-memory=${sparkDriverMemory}
+            --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
             --conf spark.extraListeners=${spark2ExtraListeners}
             --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -125,6 +125,7 @@
             --executor-memory=${sparkExecutorMemory}
             --executor-cores=${sparkExecutorCores}
             --driver-memory=${sparkDriverMemory}
+            --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
             --conf spark.extraListeners=${spark2ExtraListeners}
             --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -95,6 +95,7 @@
             --executor-memory=${sparkExecutorMemory}
             --executor-cores=${sparkExecutorCores}
             --driver-memory=${sparkDriverMemory}
+            --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
             --conf spark.extraListeners=${spark2ExtraListeners}
             --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -103,6 +103,7 @@
             --executor-memory=${sparkExecutorMemory}
             --executor-cores=${sparkExecutorCores}
             --driver-memory=${sparkDriverMemory}
+            --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
             --conf spark.extraListeners=${spark2ExtraListeners}
             --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -155,11 +156,12 @@
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=2560
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
<arg>--inputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
||||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
|
|
|
@ -95,6 +95,7 @@
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
|
|
@@ -103,11 +103,12 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
---conf spark.sql.shuffle.partitions=7000
+--conf spark.sql.shuffle.partitions=15000
 </spark-opts>
 <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
 <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>

@@ -156,11 +157,12 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
---conf spark.sql.shuffle.partitions=7000
+--conf spark.sql.shuffle.partitions=15000
 </spark-opts>
 <arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
 <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>

@@ -95,11 +95,12 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
---conf spark.sql.shuffle.partitions=10000
+--conf spark.sql.shuffle.partitions=15000
 </spark-opts>
 <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
 <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>

@@ -103,6 +103,7 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@@ -155,11 +156,12 @@
 --executor-memory=${sparkExecutorMemory}
 --executor-cores=${sparkExecutorCores}
 --driver-memory=${sparkDriverMemory}
+--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
 --conf spark.extraListeners=${spark2ExtraListeners}
 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
---conf spark.sql.shuffle.partitions=2560
+--conf spark.sql.shuffle.partitions=4000
 </spark-opts>
 <arg>--inputGraphTablePath</arg><arg>${workingDir}/software</arg>
 <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>

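Note: across all of the workflow hunks above the change is the same — every Spark action gains --conf spark.executor.memoryOverhead=${sparkExecutorMemory} and its spark.sql.shuffle.partitions value is raised per graph table. A minimal sketch of the equivalent programmatic configuration follows, with purely illustrative values and a hypothetical application name that are not taken from the workflows:

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

public class ShuffleTuningSketch {

    public static void main(String[] args) {
        // Illustrative only: mirrors the <spark-opts> changes above in code form.
        SparkConf conf = new SparkConf()
            .setAppName("graph-table-job") // hypothetical name
            .setMaster("local[*]") // for a local sanity check; the real jobs run on YARN
            // off-heap headroom for shuffle/netty buffers, set to the executor memory as in the diff
            .set("spark.executor.memoryOverhead", "7G")
            // more shuffle partitions -> smaller, more parallel reduce tasks for the big tables
            .set("spark.sql.shuffle.partitions", "15000");

        SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
        // ... submit the table-specific job here ...
        spark.stop();
    }
}

Setting the overhead equal to the executor memory roughly doubles the YARN container request per executor, which is presumably the intent for these memory-hungry joins.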
@@ -9,6 +9,7 @@ import java.util.List;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;

@@ -106,7 +107,7 @@ public class PrepareAffiliationRelations implements Serializable {
 .union(openAPCRelations)
 .union(dataciteRelations)
 .saveAsHadoopFile(
-outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 
 });
 }

@@ -10,6 +10,7 @@ import java.util.stream.Collectors;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;

@@ -83,7 +84,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
 resultsRDD
 .union(projectsRDD)
 .saveAsHadoopFile(
-outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
+outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
 });
 }
 

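Note: both action-set writers above swap GzipCodec for BZip2Codec in saveAsHadoopFile, presumably trading CPU for a better compression ratio on the action-set payloads. A self-contained sketch of the same call shape, with hypothetical paths and toy key/value pairs standing in for the serialized atomic actions:

import java.util.Arrays;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class BZip2WriteSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("bzip2-write").master("local[*]").getOrCreate();
        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        sc
            .parallelize(Arrays.asList("a", "b", "c"))
            // toy pairs; the real jobs emit (class name, JSON of the atomic action)
            .mapToPair(s -> new Tuple2<>(new Text(s), new Text(s.toUpperCase())))
            // same call shape as in the diff: sequence files compressed with BZip2 instead of Gzip
            .saveAsHadoopFile("/tmp/bzip2-output", Text.class, Text.class,
                SequenceFileOutputFormat.class, BZip2Codec.class);

        spark.stop();
    }
}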
@@ -115,19 +115,7 @@ public class PrepareFOSSparkJob implements Serializable {
 .forEach(
 l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
 r.setSubject(sbjs);
-r
-.setDataInfo(
-OafMapperUtils
-.dataInfo(
-false, null, true,
-false,
-OafMapperUtils
-.qualifier(
-ModelConstants.PROVENANCE_ENRICH,
-null,
-ModelConstants.DNET_PROVENANCE_ACTIONS,
-ModelConstants.DNET_PROVENANCE_ACTIONS),
-null));
 return r;
 }
 

@@ -81,19 +81,7 @@ public class PrepareSDGSparkJob implements Serializable {
 s -> sbjs
 .add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
 r.setSubject(sbjs);
-r
-.setDataInfo(
-OafMapperUtils
-.dataInfo(
-false, null, true,
-false,
-OafMapperUtils
-.qualifier(
-ModelConstants.PROVENANCE_ENRICH,
-null,
-ModelConstants.DNET_PROVENANCE_ACTIONS,
-ModelConstants.DNET_PROVENANCE_ACTIONS),
-null));
 return r;
 }, Encoders.bean(Result.class))
 .write()

@@ -12,6 +12,7 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.sql.*;
 import org.apache.spark.sql.types.StructType;

@@ -70,6 +71,9 @@ public class CreateActionSetFromWebEntries implements Serializable {
 final String outputPath = parser.get("outputPath");
 log.info("outputPath: {}", outputPath);
 
+final String blackListInputPath = parser.get("blackListPath");
+log.info("blackListInputPath: {}", blackListInputPath);
+
 SparkConf conf = new SparkConf();
 
 runWithSparkSession(

@@ -77,19 +81,25 @@ public class CreateActionSetFromWebEntries implements Serializable {
 isSparkSessionManaged,
 spark -> {
 
-createActionSet(spark, inputPath, outputPath);
+createActionSet(spark, inputPath, outputPath, blackListInputPath);
 
 });
 }
 
 public static void createActionSet(SparkSession spark, String inputPath,
-String outputPath) {
+String outputPath, String blackListInputPath) {
 
 final Dataset<Row> dataset = readWebCrawl(spark, inputPath)
-.filter("publication_year <= 2020 or country_code=='IE'")
+.filter("country_code=='IE'")
 .drop("publication_year");
 
-dataset.flatMap((FlatMapFunction<Row, Relation>) row -> {
+final Dataset<Row> blackList = readBlackList(spark, blackListInputPath);
+
+dataset
+.join(blackList, dataset.col("id").equalTo(blackList.col("OpenAlexId")), "left")
+.filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
+.drop("OpenAlexId")
+.flatMap((FlatMapFunction<Row, Relation>) row -> {
 List<Relation> ret = new ArrayList<>();
 final String ror = ROR_PREFIX
 + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));

@@ -136,6 +146,15 @@ public class CreateActionSetFromWebEntries implements Serializable {
 
 }
 
+private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
+
+return spark
+.read()
+.option("header", true)
+.csv(inputPath)
+.select("OpenAlexId");
+}
+
 private static List<Relation> createAffiliationRelationPairPMCID(String pmcid, String ror) {
 if (pmcid == null)
 return new ArrayList<>();

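Note: the change above threads a new blackListPath argument through the job and removes OpenAlex ids listed in a CSV before the affiliation relations are emitted — a left join against the blacklist followed by a filter that keeps only rows with no match. A self-contained sketch of the same exclusion pattern, assuming hypothetical column names and input paths:

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class BlackListFilterSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("blacklist-filter").master("local[*]").getOrCreate();

        // hypothetical inputs: crawled works with an "id" column, blacklist CSV with an "OpenAlexId" column
        Dataset<Row> works = spark.read().parquet("/tmp/webcrawl-works");
        Dataset<Row> blackList = spark.read().option("header", true).csv("/tmp/openalex-blacklist");

        // left join + "no match" filter = anti-join against the blacklist
        Dataset<Row> kept = works
            .join(blackList, works.col("id").equalTo(blackList.col("OpenAlexId")), "left")
            .filter((FilterFunction<Row>) r -> r.getAs("OpenAlexId") == null)
            .drop("OpenAlexId");

        kept.show(false);
        spark.stop();
    }
}

Spark's built-in "left_anti" join type would express the same exclusion more directly; the diff keeps the explicit left join plus null filter.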
@@ -1,6 +1,7 @@
 
 package eu.dnetlib.dhp.collection.plugin.rest;
 
+import java.util.Map;
 import java.util.Optional;
 import java.util.Spliterator;
 import java.util.Spliterators;

@@ -9,6 +10,8 @@ import java.util.stream.StreamSupport;
 
 import org.apache.commons.lang3.StringUtils;
 
+import com.google.gson.Gson;
+
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;

@@ -47,6 +50,9 @@ public class RestCollectorPlugin implements CollectorPlugin {
 final String entityXpath = api.getParams().get("entityXpath");
 final String authMethod = api.getParams().get("authMethod");
 final String authToken = api.getParams().get("authToken");
+final String requestHeaderMap = api.getParams().get("requestHeaderMap");
+Gson gson = new Gson();
+Map requestHeaders = gson.fromJson(requestHeaderMap, Map.class);
 final String resultSizeValue = Optional
 .ofNullable(api.getParams().get("resultSizeValue"))
 .filter(StringUtils::isNotBlank)

@@ -64,9 +70,6 @@ public class RestCollectorPlugin implements CollectorPlugin {
 if (StringUtils.isBlank(resultFormatValue)) {
 throw new CollectorException("Param 'resultFormatValue' is null or empty");
 }
-if (StringUtils.isBlank(queryParams)) {
-throw new CollectorException("Param 'queryParams' is null or empty");
-}
 if (StringUtils.isBlank(entityXpath)) {
 throw new CollectorException("Param 'entityXpath' is null or empty");
 }

@@ -92,7 +95,8 @@ public class RestCollectorPlugin implements CollectorPlugin {
 entityXpath,
 authMethod,
 authToken,
-resultOutputFormat);
+resultOutputFormat,
+requestHeaders);
 
 return StreamSupport
 .stream(

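Note: RestCollectorPlugin now reads an optional requestHeaderMap API parameter, deserializes it with Gson, and passes the resulting map down to RestIterator. A small sketch of that parsing step, assuming the parameter carries a flat JSON object of header names to values (the plugin uses the raw Map.class form; this sketch shows a typed variant):

import java.util.Map;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

public class RequestHeaderMapSketch {

    public static void main(String[] args) {
        // e.g. api.getParams().get("requestHeaderMap") in the plugin; hard-coded here for illustration
        String requestHeaderMap = "{\"Accept\": \"application/json\", \"User-Agent\": \"dnet-collector\"}";

        // typed deserialization of the header map
        Map<String, String> requestHeaders = new Gson()
            .fromJson(requestHeaderMap, new TypeToken<Map<String, String>>() {
            }.getType());

        requestHeaders.forEach((k, v) -> System.out.println(k + ": " + v));
    }
}

If the parameter is absent, Gson returns null; RestIterator guards against that by falling back to an empty map.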
@@ -9,8 +9,11 @@ import java.net.URL;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Queue;
 import java.util.concurrent.PriorityBlockingQueue;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;

@@ -22,14 +25,14 @@ import javax.xml.xpath.*;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.http.HttpHeaders;
-import org.apache.http.entity.ContentType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
 
+import com.google.common.collect.Maps;
+
 import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
 import eu.dnetlib.dhp.common.collection.CollectorException;
 import eu.dnetlib.dhp.common.collection.HttpClientParams;

@@ -44,23 +47,28 @@ import eu.dnetlib.dhp.common.collection.HttpClientParams;
 *
 */
 public class RestIterator implements Iterator<String> {
 
 private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
 public static final String UTF_8 = "UTF-8";
+private static final int MAX_ATTEMPTS = 5;
 
 private final HttpClientParams clientParams;
 
-private final String BASIC = "basic";
+private final String AUTHBASIC = "basic";
 
+private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+private static final String EMPTY_XML = XML_HEADER + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG
++ ">";
+
 private final String baseUrl;
 private final String resumptionType;
 private final String resumptionParam;
 private final String resultFormatValue;
-private String queryParams;
+private String queryParams = "";
 private final int resultSizeValue;
 private int resumptionInt = 0; // integer resumption token (first record to harvest)
 private int resultTotal = -1;
-private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
+private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
+// harvest
 // or token scanned from results)
 private InputStream resultStream;
 private Transformer transformer;

@@ -73,7 +81,7 @@ public class RestIterator implements Iterator<String> {
 private final String querySize;
 private final String authMethod;
 private final String authToken;
-private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
+private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
 private int discoverResultSize = 0;
 private int pagination = 1;
 /*

@@ -83,8 +91,13 @@ public class RestIterator implements Iterator<String> {
 */
 private final String resultOutputFormat;
 
-/** RestIterator class
-* compatible to version 1.3.33
+/*
+* Can be used to set additional request headers, like for content negotiation
+*/
+private Map<String, String> requestHeaders;
+
+/**
+* RestIterator class compatible to version 1.3.33
 */
 public RestIterator(
 final HttpClientParams clientParams,

@@ -101,47 +114,56 @@ public class RestIterator implements Iterator<String> {
 final String entityXpath,
 final String authMethod,
 final String authToken,
-final String resultOutputFormat) {
+final String resultOutputFormat,
+final Map<String, String> requestHeaders) {
 
 this.clientParams = clientParams;
 this.baseUrl = baseUrl;
 this.resumptionType = resumptionType;
 this.resumptionParam = resumptionParam;
 this.resultFormatValue = resultFormatValue;
-this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
+this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
 this.queryParams = queryParams;
 this.authMethod = authMethod;
 this.authToken = authToken;
 this.resultOutputFormat = resultOutputFormat;
+this.requestHeaders = requestHeaders != null ? requestHeaders : Maps.newHashMap();
 
-queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
+this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
+: "";
+this.querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr
 : "";
-querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
 
 try {
 initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
-} catch (Exception e) {
+} catch (final Exception e) {
 throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
 }
 
 initQueue();
 }
 
-private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
+private void initXmlTransformation(final String resultTotalXpath, final String resumptionXpath,
+final String entityXpath)
 throws TransformerConfigurationException, XPathExpressionException {
 final TransformerFactory factory = TransformerFactory.newInstance();
-transformer = factory.newTransformer();
-transformer.setOutputProperty(OutputKeys.INDENT, "yes");
-transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
-xpath = XPathFactory.newInstance().newXPath();
-xprResultTotalPath = xpath.compile(resultTotalXpath);
-xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
-xprEntity = xpath.compile(entityXpath);
+this.transformer = factory.newTransformer();
+this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
+this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
+this.xpath = XPathFactory.newInstance().newXPath();
+this.xprResultTotalPath = this.xpath.compile(resultTotalXpath);
+this.xprResumptionPath = this.xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
+this.xprEntity = this.xpath.compile(entityXpath);
 }
 
 private void initQueue() {
+if (queryParams.equals("") && querySize.equals("") && queryFormat.equals("")) {
+query = baseUrl;
+} else {
 query = baseUrl + "?" + queryParams + querySize + queryFormat;
-log.info("REST calls starting with {}", query);
+}
+
+log.info("REST calls starting with {}", this.query);
 }
 
 private void disconnect() {

@@ -154,11 +176,22 @@ public class RestIterator implements Iterator<String> {
 */
 @Override
 public boolean hasNext() {
-if (recordQueue.isEmpty() && query.isEmpty()) {
+synchronized (this.recordQueue) {
+while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
+try {
+this.query = downloadPage(this.query, 0);
+} catch (final CollectorException e) {
+log.debug("CollectorPlugin.next()-Exception: {}", e);
+throw new RuntimeException(e);
+}
+}
+
+if (!this.recordQueue.isEmpty()) {
+return true;
+}
+
 disconnect();
 return false;
-} else {
-return true;
 }
 }
 

@@ -168,27 +201,34 @@ public class RestIterator implements Iterator<String> {
 */
 @Override
 public String next() {
-synchronized (recordQueue) {
-while (recordQueue.isEmpty() && !query.isEmpty()) {
-try {
-query = downloadPage(query);
-} catch (CollectorException e) {
-log.debug("CollectorPlugin.next()-Exception: {}", e);
-throw new RuntimeException(e);
-}
-}
-return recordQueue.poll();
+synchronized (this.recordQueue) {
+return this.recordQueue.poll();
 }
 }
 
 /*
-* download page and return nextQuery
+* download page and return nextQuery (with number of attempt)
 */
-private String downloadPage(String query) throws CollectorException {
+private String downloadPage(String query, final int attempt) throws CollectorException {
+
+if (attempt > MAX_ATTEMPTS) {
+throw new CollectorException("Max Number of attempts reached, query:" + query);
+}
+
+if (attempt > 0) {
+final int delay = (attempt * 5000);
+log.debug("Attempt {} with delay {}", attempt, delay);
+try {
+Thread.sleep(delay);
+} catch (final InterruptedException e) {
+new CollectorException(e);
+}
+}
+
+try {
 String resultJson;
-String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+String resultXml = XML_HEADER;
 String nextQuery = "";
-String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
 Node resultNode = null;
 NodeList nodeList = null;
 String qUrlArgument = "";

@@ -196,81 +236,96 @@ public class RestIterator implements Iterator<String> {
 InputStream theHttpInputStream;
 
 // check if cursor=* is initial set otherwise add it to the queryParam URL
-if (resumptionType.equalsIgnoreCase("deep-cursor")) {
+if ("deep-cursor".equalsIgnoreCase(this.resumptionType)) {
 log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
 if (!query.contains("&cursor=")) {
 query += "&cursor=*";
 }
 }
 
+// find pagination page start number in queryParam and remove before start the first query
+if ((resumptionType.toLowerCase().equals("pagination") || resumptionType.toLowerCase().equals("page"))
+&& (query.contains("paginationStart="))) {
+
+final Matcher m = Pattern.compile("paginationStart=([0-9]+)").matcher(query);
+m.find(); // guaranteed to be true for this regex
+
+String[] pageVal = m.group(0).split("=");
+pagination = Integer.parseInt(pageVal[1]);
+
+// remove page start number from query and queryParams
+queryParams = queryParams.replaceFirst("&?paginationStart=[0-9]+", "");
+query = query.replaceFirst("&?paginationStart=[0-9]+", "");
+
+}
+
 try {
-log.info("requestig URL [{}]", query);
+log.info("requesting URL [{}]", query);
 
-URL qUrl = new URL(query);
-log.debug("authMethod: {}", authMethod);
-if ("bearer".equalsIgnoreCase(this.authMethod)) {
-log.trace("authMethod before inputStream: {}", resultXml);
-HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
-conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
-conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
-conn.setRequestMethod("GET");
-theHttpInputStream = conn.getInputStream();
-} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
-log.trace("authMethod before inputStream: {}", resultXml);
-HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
-conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
-conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
-conn.setRequestMethod("GET");
-theHttpInputStream = conn.getInputStream();
-} else {
-theHttpInputStream = qUrl.openStream();
+final URL qUrl = new URL(query);
+log.debug("authMethod: {}", this.authMethod);
+if (this.authMethod == "bearer") {
+log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
+requestHeaders.put("Authorization", "Bearer " + authToken);
+// requestHeaders.put("Content-Type", "application/json");
+} else if (AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
+log.trace("RestIterator.downloadPage():: authMethod before inputStream: " + resultXml);
+requestHeaders.put("Authorization", "Basic " + authToken);
+// requestHeaders.put("accept", "application/xml");
 }
+HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
+conn.setRequestMethod("GET");
+this.setRequestHeader(conn);
+resultStream = conn.getInputStream();
 
-resultStream = theHttpInputStream;
-if ("json".equals(resultOutputFormat)) {
-resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
+if ("json".equals(this.resultOutputFormat)) {
+resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
 resultXml = JsonUtils.convertToXML(resultJson);
-resultStream = IOUtils.toInputStream(resultXml, UTF_8);
+this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
 }
 
-if (!(emptyXml).equalsIgnoreCase(resultXml)) {
-resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
-nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
+if (!isEmptyXml(resultXml)) {
+resultNode = (Node) this.xpath
+.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
+nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
 log.debug("nodeList.length: {}", nodeList.getLength());
 for (int i = 0; i < nodeList.getLength(); i++) {
-StringWriter sw = new StringWriter();
-transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
-String toEnqueue = sw.toString();
-if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
-log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
+final StringWriter sw = new StringWriter();
+this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
+final String toEnqueue = sw.toString();
+if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue) || isEmptyXml(toEnqueue)) {
+log
+.warn(
+"The following record resulted in empty item for the feeding queue: {}", resultXml);
 } else {
-recordQueue.add(sw.toString());
+this.recordQueue.add(sw.toString());
 }
 }
 } else {
 log.warn("resultXml is equal with emptyXml");
 }
 
-resumptionInt += resultSizeValue;
+this.resumptionInt += this.resultSizeValue;
 
-switch (resumptionType.toLowerCase()) {
+switch (this.resumptionType.toLowerCase()) {
 case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
-resumptionStr = xprResumptionPath.evaluate(resultNode);
+this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
 break;
 
 case "count": // begin at one step for all records, iterate over items
-resumptionStr = Integer.toString(resumptionInt);
+this.resumptionStr = Integer.toString(this.resumptionInt);
 break;
 
 case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
-if (resultSizeValue < 2) {
+if (this.resultSizeValue < 2) {
 throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
 }
 qUrlArgument = qUrl.getQuery();
-String[] arrayQUrlArgument = qUrlArgument.split("&");
-for (String arrayUrlArgStr : arrayQUrlArgument) {
-if (arrayUrlArgStr.startsWith(resumptionParam)) {
-String[] resumptionKeyValue = arrayUrlArgStr.split("=");
+final String[] arrayQUrlArgument = qUrlArgument.split("&");
+for (final String arrayUrlArgStr : arrayQUrlArgument) {
+if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
+final String[] resumptionKeyValue = arrayUrlArgStr.split("=");
 if (isInteger(resumptionKeyValue[1])) {
 urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
 log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);

@@ -280,61 +335,63 @@ public class RestIterator implements Iterator<String> {
 }
 }
 
-if (((emptyXml).equalsIgnoreCase(resultXml))
-|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
+if (isEmptyXml(resultXml)
+|| ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
 // resumptionStr = "";
 if (nodeList != null) {
-discoverResultSize += nodeList.getLength();
+this.discoverResultSize += nodeList.getLength();
 }
-resultTotal = discoverResultSize;
+this.resultTotal = this.discoverResultSize;
 } else {
-resumptionStr = Integer.toString(resumptionInt);
-resultTotal = resumptionInt + 1;
+this.resumptionStr = Integer.toString(this.resumptionInt);
+this.resultTotal = this.resumptionInt + 1;
 if (nodeList != null) {
-discoverResultSize += nodeList.getLength();
+this.discoverResultSize += nodeList.getLength();
 }
 }
-log.info("discoverResultSize: {}", discoverResultSize);
+log.info("discoverResultSize: {}", this.discoverResultSize);
 break;
 
 case "pagination":
 case "page": // pagination, iterate over page numbers
-pagination += 1;
-if (nodeList != null) {
-discoverResultSize += nodeList.getLength();
+if (nodeList != null && nodeList.getLength() > 0) {
+this.discoverResultSize += nodeList.getLength();
 } else {
-resultTotal = discoverResultSize;
-pagination = discoverResultSize;
+this.resultTotal = this.discoverResultSize;
+this.pagination = this.discoverResultSize;
 }
-resumptionInt = pagination;
-resumptionStr = Integer.toString(resumptionInt);
+this.pagination += 1;
+this.resumptionInt = this.pagination;
+this.resumptionStr = Integer.toString(this.resumptionInt);
 break;
 
-case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
+case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor
+// in
 // solr)
 // isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
 // deep-cursor, Param 'resultSizeValue' is less than 2");}
 
-resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
-queryParams = queryParams.replace("&cursor=*", "");
+this.resumptionStr = encodeValue(this.xprResumptionPath.evaluate(resultNode));
+this.queryParams = this.queryParams.replace("&cursor=*", "");
 
 // terminating if length of nodeList is 0
-if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
-resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
+if ((nodeList != null) && (nodeList.getLength() < this.discoverResultSize)) {
+this.resumptionInt += ((nodeList.getLength() + 1) - this.resultSizeValue);
 } else {
-resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
+this.resumptionInt += (nodeList.getLength() - this.resultSizeValue); // subtract the
+// resultSizeValue
 // because the iteration is over
 // real length and the
 // resultSizeValue is added before
 // the switch()
 }
 
-discoverResultSize = nodeList.getLength();
+this.discoverResultSize = nodeList.getLength();
 
 log
 .debug(
-"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
-+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
+"downloadPage().deep-cursor: resumptionStr=" + this.resumptionStr + " ; queryParams="
++ this.queryParams + " resumptionLengthIncreased: " + this.resumptionInt);
 
 break;
 

@@ -343,28 +400,30 @@ public class RestIterator implements Iterator<String> {
 break;
 }
 
-} catch (Exception e) {
+} catch (final Exception e) {
 log.error(e.getMessage(), e);
 throw new IllegalStateException("collection failed: " + e.getMessage());
 }
 
 try {
-if (resultTotal == -1) {
-resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
-if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
-resultTotal += 1;
+if (this.resultTotal == -1) {
+this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
+if ("page".equalsIgnoreCase(this.resumptionType)
+&& !this.AUTHBASIC.equalsIgnoreCase(this.authMethod)) {
+this.resultTotal += 1;
 } // to correct the upper bound
-log.info("resultTotal was -1 is now: " + resultTotal);
+log.info("resultTotal was -1 is now: " + this.resultTotal);
 }
-} catch (Exception e) {
+} catch (final Exception e) {
 log.error(e.getMessage(), e);
 throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
 }
-log.debug("resultTotal: " + resultTotal);
-log.debug("resInt: " + resumptionInt);
-if (resumptionInt <= resultTotal) {
-nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
-+ queryFormat;
+log.debug("resultTotal: " + this.resultTotal);
+log.debug("resInt: " + this.resumptionInt);
+if (this.resumptionInt <= this.resultTotal) {
+nextQuery = this.baseUrl + "?" + this.queryParams + this.querySize + "&" + this.resumptionParam + "="
++ this.resumptionStr
++ this.queryFormat;
 } else {
 nextQuery = "";
 // if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the

@@ -372,10 +431,18 @@ public class RestIterator implements Iterator<String> {
 }
 log.debug("nextQueryUrl: " + nextQuery);
 return nextQuery;
+} catch (final Throwable e) {
+log.warn(e.getMessage(), e);
+return downloadPage(query, attempt + 1);
+}
+
 }
 
-private boolean isInteger(String s) {
+private boolean isEmptyXml(String s) {
+return EMPTY_XML.equalsIgnoreCase(s);
+}
+
+private boolean isInteger(final String s) {
 boolean isValidInteger = false;
 try {
 Integer.parseInt(s);

@@ -383,7 +450,7 @@ public class RestIterator implements Iterator<String> {
 // s is a valid integer
 
 isValidInteger = true;
-} catch (NumberFormatException ex) {
+} catch (final NumberFormatException ex) {
 // s is not an integer
 }
 

@@ -391,20 +458,36 @@ public class RestIterator implements Iterator<String> {
 }
 
 // Method to encode a string value using `UTF-8` encoding scheme
-private String encodeValue(String value) {
+private String encodeValue(final String value) {
 try {
 return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
-} catch (UnsupportedEncodingException ex) {
+} catch (final UnsupportedEncodingException ex) {
 throw new RuntimeException(ex.getCause());
 }
 }
 
+/**
+* setRequestHeader
+*
+* setRequestProperty: Sets the general request property. If a property with the key already exists, overwrite its value with the new value.
+* @param conn
+*/
+private void setRequestHeader(HttpURLConnection conn) {
+if (requestHeaders != null) {
+for (String key : requestHeaders.keySet()) {
+conn.setRequestProperty(key, requestHeaders.get(key));
+}
+log.debug("Set Request Header with: " + requestHeaders);
+}
+
+}
+
 public String getResultFormatValue() {
-return resultFormatValue;
+return this.resultFormatValue;
 }
 
 public String getResultOutputFormat() {
-return resultOutputFormat;
+return this.resultOutputFormat;
 }
 
 }

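Note: the largest change in RestIterator moves the page download loop from next() into hasNext(), applies the configured request headers via setRequestHeader(conn), and wraps downloadPage in a bounded retry — up to MAX_ATTEMPTS tries with a linearly growing delay, re-invoking itself with attempt + 1 on any Throwable. A compact sketch of that retry shape in isolation, assuming a hypothetical fetch function and the same 5-second step:

import java.util.concurrent.Callable;

public class LinearRetrySketch {

    private static final int MAX_ATTEMPTS = 5;
    private static final long DELAY_STEP_MS = 5000L;

    // retries action with a delay of attempt * DELAY_STEP_MS, mirroring downloadPage(query, attempt)
    static <T> T withRetry(Callable<T> action, int attempt) throws Exception {
        if (attempt > MAX_ATTEMPTS) {
            throw new IllegalStateException("Max number of attempts reached");
        }
        if (attempt > 0) {
            Thread.sleep(attempt * DELAY_STEP_MS);
        }
        try {
            return action.call();
        } catch (Throwable e) {
            System.err.println("attempt " + attempt + " failed: " + e.getMessage());
            return withRetry(action, attempt + 1);
        }
    }

    public static void main(String[] args) throws Exception {
        // hypothetical fetch that fails twice before succeeding
        int[] calls = {0};
        String result = withRetry(() -> {
            if (calls[0]++ < 2) {
                throw new RuntimeException("transient failure");
            }
            return "page-content";
        }, 0);
        System.out.println(result);
    }
}

One difference worth noting: the diff constructs but never throws a CollectorException when the sleep is interrupted, so interruption is effectively swallowed there; this sketch simply propagates it.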
@@ -8,7 +8,10 @@ import java.io.StringWriter;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;
 import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
 
 import javax.xml.stream.XMLEventFactory;
 import javax.xml.stream.XMLEventReader;

@@ -19,6 +22,7 @@ import javax.xml.stream.XMLStreamException;
 import javax.xml.stream.events.StartElement;
 import javax.xml.stream.events.XMLEvent;
 
+import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 

@@ -58,13 +62,23 @@ public class XMLIterator implements Iterator<String> {
 
 private String element;
 
+private List<String> elements;
+
 private InputStream inputStream;
 
 public XMLIterator(final String element, final InputStream inputStream) {
 super();
 this.element = element;
+if (element.contains(",")) {
+elements = Arrays
+.stream(element.split(","))
+.filter(StringUtils::isNoneBlank)
+.map(String::toLowerCase)
+.collect(Collectors.toList());
+}
 this.inputStream = inputStream;
 this.parser = getParser();
 
 try {
 this.current = findElement(parser);
 } catch (XMLStreamException e) {

@@ -113,7 +127,7 @@ public class XMLIterator implements Iterator<String> {
 final XMLEvent event = parser.nextEvent();
 
 // TODO: replace with depth tracking instead of close tag tracking.
-if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
+if (event.isEndElement() && isCheckTag(event.asEndElement().getName().getLocalPart())) {
 writer.add(event);
 break;
 }

@@ -142,31 +156,48 @@ public class XMLIterator implements Iterator<String> {
 XMLEvent peek = parser.peek();
 if (peek != null && peek.isStartElement()) {
 String name = peek.asStartElement().getName().getLocalPart();
-if (element.equals(name)) {
+if (isCheckTag(name))
 return peek;
 }
-}
 
 while (parser.hasNext()) {
-final XMLEvent event = parser.nextEvent();
+XMLEvent event = parser.nextEvent();
 if (event != null && event.isStartElement()) {
 String name = event.asStartElement().getName().getLocalPart();
-if (element.equals(name)) {
+if (isCheckTag(name))
 return event;
 }
 }
-}
 return null;
 }
 
 private XMLEventReader getParser() {
 try {
-return inputFactory.get().createXMLEventReader(sanitize(inputStream));
+XMLInputFactory xif = inputFactory.get();
+xif.setProperty(XMLInputFactory.SUPPORT_DTD, false);
+return xif.createXMLEventReader(sanitize(inputStream));
 } catch (XMLStreamException e) {
 throw new RuntimeException(e);
 }
 }
 
+private boolean isCheckTag(final String tagName) {
+if (elements != null) {
+final String found = elements
+.stream()
+.filter(e -> e.equalsIgnoreCase(tagName))
+.findFirst()
+.orElse(null);
+if (found != null)
+return true;
+} else {
+if (element.equalsIgnoreCase(tagName)) {
+return true;
+}
+}
+return false;
+}
+
 private Reader sanitize(final InputStream in) {
 final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
 charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);

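Note: XMLIterator now accepts a comma-separated list of tag names — the constructor splits the element argument into a lowercase list, and isCheckTag matches start/end elements case-insensitively against either that list or the single element; it also disables DTD support on the XMLInputFactory. A small standalone sketch of the tag-matching idea, with hypothetical tag names and the diff's filter/findFirst condensed into anyMatch:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class TagMatcherSketch {

    private final String element;
    private final List<String> elements;

    TagMatcherSketch(String element) {
        this.element = element;
        // comma-separated input means "accept any of these tags"
        this.elements = element.contains(",")
            ? Arrays
                .stream(element.split(","))
                .filter(s -> !s.trim().isEmpty())
                .map(String::toLowerCase)
                .collect(Collectors.toList())
            : null;
    }

    boolean isCheckTag(String tagName) {
        if (elements != null) {
            return elements.stream().anyMatch(e -> e.equalsIgnoreCase(tagName));
        }
        return element.equalsIgnoreCase(tagName);
    }

    public static void main(String[] args) {
        TagMatcherSketch matcher = new TagMatcherSketch("record,entry");
        System.out.println(matcher.isCheckTag("Record")); // true
        System.out.println(matcher.isCheckTag("header")); // false
    }
}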
|
@ -16,5 +16,10 @@
|
||||||
"paramLongName": "isSparkSessionManaged",
|
"paramLongName": "isSparkSessionManaged",
|
||||||
"paramDescription": "the hdfs name node",
|
"paramDescription": "the hdfs name node",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
}
|
},{
|
||||||
|
"paramName": "bl",
|
||||||
|
"paramLongName": "blackListPath",
|
||||||
|
"paramDescription": "the working path",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@@ -1,2 +1,3 @@
 sourcePath=/user/miriam.baglioni/openalex-snapshot/data/works/
 outputPath=/tmp/miriam/webcrawlComplete/
+blackListPath=/user/miriam.baglioni/openalex-blackList

@@ -45,6 +45,7 @@
 </spark-opts>
 <arg>--sourcePath</arg><arg>${sourcePath}</arg>
 <arg>--outputPath</arg><arg>${outputPath}</arg>
+<arg>--blackListPath</arg><arg>${blackListPath}</arg>
 </spark>
 <ok to="End"/>
 <error to="Kill"/>

@@ -1,10 +1,5 @@
 [
-{
-"id": "100007630",
-"uri": "http://dx.doi.org/10.13039/100007630",
-"name": "College of Engineering and Informatics, National University of Ireland, Galway",
-"synonym": []
-},
 {
 "id": "100007731",
 "uri": "http://dx.doi.org/10.13039/100007731",
@@ -58,7 +53,7 @@
 "uri": "http://dx.doi.org/10.13039/100010414",
 "name": "Health Research Board",
 "synonym": [
-"501100001590"
+"501100001590", "501100023273"
 ]
 },
 {
@@ -85,24 +80,6 @@
 "name": "Irish College of General Practitioners",
 "synonym": []
 },
-{
-"id": "100012734",
-"uri": "http://dx.doi.org/10.13039/100012734",
-"name": "Department for Culture, Heritage and the Gaeltacht, Ireland",
-"synonym": []
-},
-{
-"id": "100012754",
-"uri": "http://dx.doi.org/10.13039/100012754",
-"name": "Horizon Pharma",
-"synonym": []
-},
-{
-"id": "100012891",
-"uri": "http://dx.doi.org/10.13039/100012891",
-"name": "Medical Research Charities Group",
-"synonym": []
-},
 {
 "id": "100012919",
 "uri": "http://dx.doi.org/10.13039/100012919",
@@ -233,7 +210,7 @@
 "id": "100018064",
 "uri": "http://dx.doi.org/10.13039/100018064",
 "name": "Department of Tourism, Culture, Arts, Gaeltacht, Sport and Media",
-"synonym": []
+"synonym": ["100012734"]
 },
 {
 "id": "100018172",
@@ -281,13 +258,13 @@
 "id": "100019637",
 "uri": "http://dx.doi.org/10.13039/100019637",
 "name": "Horizon Therapeutics",
-"synonym": []
+"synonym": ["100012754"]
 },
 {
 "id": "100020174",
 "uri": "http://dx.doi.org/10.13039/100020174",
 "name": "Health Research Charities Ireland",
-"synonym": []
+"synonym": ["100012891"]
 },
 {
 "id": "100020202",
@@ -319,12 +296,7 @@
 "name": "Centre for Ageing Research and Development in Ireland",
 "synonym": []
 },
-{
-"id": "501100001583",
-"uri": "http://dx.doi.org/10.13039/501100001583",
-"name": "Cystinosis Foundation Ireland",
-"synonym": []
-},
 {
 "id": "501100001584",
 "uri": "http://dx.doi.org/10.13039/501100001584",
@@ -455,13 +427,13 @@
 "id": "501100001634",
 "uri": "http://dx.doi.org/10.13039/501100001634",
 "name": "University of Galway",
-"synonym": []
+"synonym": ["501100019905", "100007630", "501100020570", "501100023852"]
 },
 {
 "id": "501100001635",
 "uri": "http://dx.doi.org/10.13039/501100001635",
 "name": "University of Limerick",
-"synonym": []
+"synonym": ["501100014531"]
 },
 {
 "id": "501100001636",
@@ -491,7 +463,7 @@
 "id": "501100002736",
 "uri": "http://dx.doi.org/10.13039/501100002736",
 "name": "Covidien",
-"synonym": []
+"synonym": ["501100003956"]
 },
 {
 "id": "501100002755",
@@ -521,7 +493,7 @@
 "id": "501100003037",
 "uri": "http://dx.doi.org/10.13039/501100003037",
 "name": "Elan",
-"synonym": []
+"synonym": ["501100021694"]
 },
 {
 "id": "501100003496",
@@ -541,12 +513,6 @@
 "name": "Irish Institute of Clinical Neuroscience",
 "synonym": []
 },
-{
-"id": "501100003956",
-"uri": "http://dx.doi.org/10.13039/501100003956",
-"name": "Aspect Medical Systems",
-"synonym": []
-},
 {
 "id": "501100004162",
 "uri": "http://dx.doi.org/10.13039/501100004162",
@@ -595,17 +561,11 @@
 "name": "Technological University Dublin",
 "synonym": []
 },
-{
-"id": "501100009269",
-"uri": "http://dx.doi.org/10.13039/501100009269",
-"name": "Programme of Competitive Forestry Research for Development",
-"synonym": []
-},
 {
 "id": "501100009315",
 "uri": "http://dx.doi.org/10.13039/501100009315",
 "name": "Cystinosis Ireland",
-"synonym": []
+"synonym": ["501100001583"]
 },
 {
 "id": "501100010808",
@@ -625,12 +585,6 @@
 "name": "Alimentary Health",
 "synonym": []
 },
-{
-"id": "501100011103",
-"uri": "http://dx.doi.org/10.13039/501100011103",
-"name": "Rann\u00eds",
-"synonym": []
-},
 {
 "id": "501100012354",
 "uri": "http://dx.doi.org/10.13039/501100012354",
@@ -679,12 +633,7 @@
 "name": "Irish Centre for High-End Computing",
 "synonym": []
 },
-{
-"id": "501100019905",
-"uri": "http://dx.doi.org/10.13039/501100019905",
-"name": "Galway University Foundation",
-"synonym": []
-},
 {
 "id": "501100020036",
 "uri": "http://dx.doi.org/10.13039/501100020036",
@@ -733,12 +682,6 @@
 "name": "Insight SFI Research Centre for Data Analytics",
 "synonym": []
 },
-{
-"id": "501100021694",
-"uri": "http://dx.doi.org/10.13039/501100021694",
-"name": "Elan Pharma International",
-"synonym": []
-},
 {
 "id": "501100021838",
 "uri": "http://dx.doi.org/10.13039/501100021838",
@@ -769,12 +712,6 @@
 "name": "Institute of Technology, Tralee",
 "synonym": []
 },
-{
-"id": "501100023273",
-"uri": "http://dx.doi.org/10.13039/501100023273",
-"name": "HRB Clinical Research Facility Galway",
-"synonym": []
-},
 {
 "id": "501100023378",
 "uri": "http://dx.doi.org/10.13039/501100023378",
@@ -871,12 +808,7 @@
 "name": "Energy Policy Research Centre, Economic and Social Research Institute",
 "synonym": []
 },
-{
-"id": "501100014531",
-"uri": "http://dx.doi.org/10.13039/501100014531",
-"name": "Physical Education and Sport Sciences Department, University of Limerick",
-"synonym": []
-},
 {
 "id": "501100014745",
 "uri": "http://dx.doi.org/10.13039/501100014745",
@@ -889,22 +821,11 @@
 "name": "ADAPT - Centre for Digital Content Technology",
 "synonym": []
 },
-{
-"id": "501100020570",
-"uri": "http://dx.doi.org/10.13039/501100020570",
-"name": "College of Medicine, Nursing and Health Sciences, National University of Ireland, Galway",
-"synonym": []
-},
 {
 "id": "501100020871",
 "uri": "http://dx.doi.org/10.13039/501100020871",
 "name": "Bernal Institute, University of Limerick",
 "synonym": []
-},
-{
-"id": "501100023852",
-"uri": "http://dx.doi.org/10.13039/501100023852",
-"name": "Moore Institute for Research in the Humanities and Social Studies, University of Galway",
-"synonym": []
 }
 ]
@@ -48,12 +48,37 @@
 <description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
 </property>
+<property>
+<name>JAVA_HOME</name>
+<value>/srv/java/openjdk-17</value>
+<description>Used to configure the Java home location for oozie.launcher.mapreduce.map.env</description>
+</property>
+<property>
+<name>JAVA_OPTS</name>
+<value>-Dcom.sun.security.enableAIAcaIssuers=true</value>
+<description>Used to configure the JAVA_OPTS parameter</description>
+</property>
 </parameters>
 
 <global>
 <job-tracker>${jobTracker}</job-tracker>
 <name-node>${nameNode}</name-node>
+<configuration>
+<property>
+<name>mapreduce.job.queuename</name>
+<value>${queueName}</value>
+</property>
+<property>
+<name>oozie.launcher.mapred.job.queue.name</name>
+<value>${oozieLauncherQueueName}</value>
+</property>
+<property>
+<name>oozie.launcher.mapreduce.map.env</name>
+<value>JAVA_HOME=${JAVA_HOME}</value>
+</property>
+</configuration>
 </global>
 
 <start to="collection_mode"/>
@@ -99,7 +124,7 @@
 <action name="CollectionWorker">
 <java>
 <main-class>eu.dnetlib.dhp.collection.CollectorWorkerApplication</main-class>
-<java-opts>${collection_java_xmx}</java-opts>
+<java-opts>${JAVA_OPTS} ${collection_java_xmx}</java-opts>
 <arg>--apidescriptor</arg><arg>${apiDescription}</arg>
 <arg>--namenode</arg><arg>${nameNode}</arg>
 <arg>--workflowId</arg><arg>${workflowId}</arg>
@@ -1025,6 +1025,7 @@ case object Crossref2Oaf {
 tp._1 match {
 case "electronic" => journal.setIssnOnline(tp._2)
 case "print" => journal.setIssnPrinted(tp._2)
+case _ =>
 }
 })
 }
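The wildcard arm added above keeps the ISSN-type match total: Crossref records can carry types other than "electronic" and "print" (the test fixture later in this changeset keeps an entry with type "pu"). A minimal Scala sketch of the same pattern, using a hypothetical Journal stub rather than the real OAF bean:

    // Hypothetical stub used only to illustrate the match shape.
    case class Journal(var issnOnline: String = null, var issnPrinted: String = null)

    def applyIssn(journal: Journal, tp: (String, String)): Unit =
      tp._1 match {
        case "electronic" => journal.issnOnline = tp._2
        case "print"      => journal.issnPrinted = tp._2
        case _            => // unknown ISSN types (e.g. "pu") are ignored instead of raising scala.MatchError
      }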
@@ -79,23 +79,6 @@ object MagUtility extends Serializable {
 private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)
 
 private val MAGDataInfo: DataInfo = {
-val di = new DataInfo
-di.setDeletedbyinference(false)
-di.setInferred(false)
-di.setInvisible(false)
-di.setTrust("0.9")
-di.setProvenanceaction(
-OafMapperUtils.qualifier(
-ModelConstants.SYSIMPORT_ACTIONSET,
-ModelConstants.SYSIMPORT_ACTIONSET,
-ModelConstants.DNET_PROVENANCE_ACTIONS,
-ModelConstants.DNET_PROVENANCE_ACTIONS
-)
-)
-di
-}
-
-private val MAGDataInfoInvisible: DataInfo = {
 val di = new DataInfo
 di.setDeletedbyinference(false)
 di.setInferred(false)
@@ -453,7 +436,6 @@ object MagUtility extends Serializable {
 
 case "repository" =>
 result = new Publication()
-result.setDataInfo(MAGDataInfoInvisible)
 qualifier(
 "0038",
 "Other literature type",
@@ -488,7 +470,6 @@ object MagUtility extends Serializable {
 }
 
 if (result != null) {
-if (result.getDataInfo == null)
 result.setDataInfo(MAGDataInfo)
 val i = new Instance
 i.setInstancetype(tp)
@@ -512,7 +493,7 @@ object MagUtility extends Serializable {
 return null
 
 result.setCollectedfrom(List(MAGCollectedFrom).asJava)
-val pidList = List(
+var pidList = List(
 structuredProperty(
 paper.paperId.get.toString,
 qualifier(
@@ -525,8 +506,6 @@ object MagUtility extends Serializable {
 )
 )
 
-result.setPid(pidList.asJava)
-
 result.setOriginalId(pidList.map(s => s.getValue).asJava)
 
 result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
@@ -618,10 +597,9 @@ object MagUtility extends Serializable {
 }
 
 val instance = result.getInstance().get(0)
-instance.setPid(pidList.asJava)
-if (paper.doi.orNull != null)
-instance.setAlternateIdentifier(
-List(
+if (paper.doi.orNull != null) {
+pidList = pidList ::: List(
 structuredProperty(
 paper.doi.get,
 qualifier(
@@ -632,8 +610,10 @@ object MagUtility extends Serializable {
 ),
 null
 )
-).asJava
 )
+}
+instance.setPid(pidList.asJava)
+result.setPid(pidList.asJava)
 instance.setUrl(paper.urls.get.asJava)
 instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
 instance.setCollectedfrom(MAGCollectedFrom)
@@ -38,6 +38,7 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
 spark.read
 .load(s"$magBasePath/mag_denormalized")
 .as[MAGPaper]
+.filter(col("doi").isNotNull)
 .map(s => MagUtility.convertMAGtoOAF(s))
 .filter(s => s != null)
 .write
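Taken together, the MagUtility and SparkMAGtoOAF changes above make the pid list mutable, append the DOI to it when one is present, and set the same list on both the result and its instance (instead of routing the DOI to alternateIdentifier). A minimal sketch of that accumulation pattern, with a simplified Pid case class standing in for the real StructuredProperty/qualifier API:

    // Simplified stand-in for StructuredProperty; only the list handling is illustrated.
    case class Pid(scheme: String, value: String)

    def buildPidList(magPaperId: String, doi: Option[String]): List[Pid] = {
      var pidList = List(Pid("mag_id", magPaperId)) // the MAG identifier is always a pid
      if (doi.isDefined)
        pidList = pidList ::: List(Pid("doi", doi.get)) // the DOI is appended only when present
      pidList // the caller sets this same list via result.setPid and instance.setPid
    }

    // buildPidList("2401837580", Some("10.1000/182"))
    //   => List(Pid(mag_id,2401837580), Pid(doi,10.1000/182))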
@@ -2,12 +2,9 @@ package eu.dnetlib.dhp.sx.bio.ebi
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.collection.CollectionUtils
-import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
-import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
+import eu.dnetlib.dhp.schema.oaf.Oaf
 import eu.dnetlib.dhp.sx.bio.pubmed._
-import eu.dnetlib.dhp.utils.DHPUtils.{MAPPER, writeHdfsFile}
 import eu.dnetlib.dhp.utils.ISLookupClientFactory
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.conf.Configuration
@@ -17,13 +14,13 @@ import org.apache.http.client.methods.HttpGet
 import org.apache.http.impl.client.HttpClientBuilder
 import org.apache.spark.SparkConf
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql._
+import org.apache.spark.sql.expressions.Aggregator
 import org.slf4j.{Logger, LoggerFactory}
 
-import java.io.InputStream
-import scala.io.Source
-import scala.xml.pull.XMLEventReader
+import java.io.{ByteArrayInputStream, InputStream}
+import java.nio.charset.Charset
+import javax.xml.stream.XMLInputFactory
 
 object SparkCreateBaselineDataFrame {
@@ -86,7 +83,7 @@ object SparkCreateBaselineDataFrame {
 if (response.getStatusLine.getStatusCode > 400) {
 tries -= 1
 } else
-return IOUtils.toString(response.getEntity.getContent)
+return IOUtils.toString(response.getEntity.getContent, Charset.defaultCharset())
 } catch {
 case e: Throwable =>
 println(s"Error on requesting ${r.getURI}")
@@ -158,7 +155,8 @@ object SparkCreateBaselineDataFrame {
 IOUtils.toString(
 SparkEBILinksToOaf.getClass.getResourceAsStream(
 "/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
-)
+),
+Charset.defaultCharset()
 )
 )
 parser.parseArgument(args)
@@ -167,15 +165,11 @@ object SparkCreateBaselineDataFrame {
 val workingPath = parser.get("workingPath")
 log.info("workingPath: {}", workingPath)
 
-val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
-log.info("mdstoreOutputVersion: {}", mdstoreOutputVersion)
-
-val cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
-val outputBasePath = cleanedMdStoreVersion.getHdfsPath
-log.info("outputBasePath: {}", outputBasePath)
+val targetPath = parser.get("targetPath")
+log.info("targetPath: {}", targetPath)
 
 val hdfsServerUri = parser.get("hdfsServerUri")
-log.info("hdfsServerUri: {}", hdfsServerUri)
+log.info("hdfsServerUri: {}", targetPath)
 
 val skipUpdate = parser.get("skipUpdate")
 log.info("skipUpdate: {}", skipUpdate)
@@ -201,10 +195,11 @@ object SparkCreateBaselineDataFrame {
 if (!"true".equalsIgnoreCase(skipUpdate)) {
 downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
 val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
+val inputFactory = XMLInputFactory.newInstance
 val ds: Dataset[PMArticle] = spark.createDataset(
 k.filter(i => i._1.endsWith(".gz"))
 .flatMap(i => {
-val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
+val xml = inputFactory.createXMLEventReader(new ByteArrayInputStream(i._2.getBytes()))
 new PMParser(xml)
 })
 )
@@ -223,11 +218,8 @@ object SparkCreateBaselineDataFrame {
 .map(a => PubMedToOaf.convert(a, vocabularies))
 .as[Oaf]
 .filter(p => p != null),
-s"$outputBasePath/$MDSTORE_DATA_PATH"
+targetPath
 )
 
-val df = spark.read.text(s"$outputBasePath/$MDSTORE_DATA_PATH")
-val mdStoreSize = df.count
-writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$mdStoreSize", s"$outputBasePath/$MDSTORE_SIZE_PATH")
 }
 }
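The baseline-parsing changes above replace scala.xml.pull (removed from newer versions of the scala-xml library) with the JDK's StAX API. A self-contained sketch of the replacement, assuming only javax.xml.stream from the standard library:

    import java.io.ByteArrayInputStream
    import javax.xml.stream.{XMLEventReader, XMLInputFactory}

    // Build a StAX XMLEventReader from an in-memory XML string, mirroring how each
    // decompressed PubMed baseline file is now handed to PMParser.
    def eventReaderFor(xml: String): XMLEventReader = {
      val inputFactory = XMLInputFactory.newInstance
      inputFactory.createXMLEventReader(new ByteArrayInputStream(xml.getBytes("UTF-8")))
    }

    // Example: iterate over the raw events of a tiny document.
    val events = eventReaderFor("<PubmedArticleSet><PubmedArticle/></PubmedArticleSet>")
    while (events.hasNext) println(events.nextEvent())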
@@ -1,7 +1,8 @@
 package eu.dnetlib.dhp.sx.bio.pubmed
 
 import scala.xml.MetaData
-import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
+import javax.xml.stream.XMLEventReader
+import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
 
 /** @param xml
 */
@@ -15,10 +15,7 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -119,7 +119,9 @@ public class ReadCOCITest {
 workingDir.toString() + "/COCI",
 "-outputPath",
 workingDir.toString() + "/COCI_json/",
-"-inputFile", "input1;input2;input3;input4;input5"
+"-inputFile", "input1;input2;input3;input4;input5",
+"-format",
+"COCI"
 });
 
 final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@@ -75,7 +75,11 @@ public class CreateASTest {
 
 String inputPath = getClass()
 .getResource(
-"/eu/dnetlib/dhp/actionmanager/webcrawl/")
+"/eu/dnetlib/dhp/actionmanager/webcrawl/input/")
+.getPath();
+String blackListPath = getClass()
+.getResource(
+"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
 .getPath();
 
 CreateActionSetFromWebEntries
@@ -86,7 +90,8 @@ public class CreateASTest {
 "-sourcePath",
 inputPath,
 "-outputPath",
-workingDir.toString() + "/actionSet1"
+workingDir.toString() + "/actionSet1",
+"-blackListPath", blackListPath
 });
 
 final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@@ -96,7 +101,7 @@ public class CreateASTest {
 .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 .map(aa -> ((Relation) aa.getPayload()));
 
-Assertions.assertEquals(64, tmp.count());
+Assertions.assertEquals(58, tmp.count());
 
 }
 
@@ -109,6 +114,10 @@ public class CreateASTest {
 .getResource(
 "/eu/dnetlib/dhp/actionmanager/webcrawl/")
 .getPath();
+String blackListPath = getClass()
+.getResource(
+"/eu/dnetlib/dhp/actionmanager/webcrawl/blackList/")
+.getPath();
 
 CreateActionSetFromWebEntries
 .main(
@@ -118,7 +127,8 @@ public class CreateASTest {
 "-sourcePath",
 inputPath,
 "-outputPath",
-workingDir.toString() + "/actionSet1"
+workingDir.toString() + "/actionSet1",
+"-blackListPath", blackListPath
 });
 
 final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
@@ -184,7 +194,7 @@ public class CreateASTest {
 
 Assertions
 .assertEquals(
-5, tmp
+2, tmp
 .filter(
 r -> r
 .getSource()
@@ -197,7 +207,7 @@ public class CreateASTest {
 
 Assertions
 .assertEquals(
-5, tmp
+2, tmp
 .filter(
 r -> r
 .getTarget()
@@ -210,7 +220,7 @@ public class CreateASTest {
 
 Assertions
 .assertEquals(
-2, tmp
+1, tmp
 .filter(
 r -> r
 .getTarget()
@@ -224,7 +234,7 @@ public class CreateASTest {
 
 Assertions
 .assertEquals(
-2, tmp
+1, tmp
 .filter(
 r -> r
 .getTarget()
@@ -238,7 +248,7 @@ public class CreateASTest {
 
 Assertions
 .assertEquals(
-1, tmp
+0, tmp
 .filter(
 r -> r
 .getTarget()
@@ -0,0 +1,64 @@
+
+package eu.dnetlib.dhp.collection.plugin.file;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Objects;
+import java.util.stream.Stream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.junit.jupiter.MockitoExtension;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
+import eu.dnetlib.dhp.common.collection.CollectorException;
+
+@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
+@ExtendWith(MockitoExtension.class)
+public class FileGZipMultipleNodeTest {
+
+private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);
+
+private final ApiDescriptor api = new ApiDescriptor();
+
+private FileGZipCollectorPlugin plugin;
+
+private static final String SPLIT_ON_ELEMENT = "incollection,article";
+
+@BeforeEach
+public void setUp() throws IOException {
+
+final String gzipFile = Objects
+.requireNonNull(
+this
+.getClass()
+.getResource("/eu/dnetlib/dhp/collection/plugin/file/dblp.gz"))
+.getFile();
+
+api.setBaseUrl(gzipFile);
+
+HashMap<String, String> params = new HashMap<>();
+params.put("splitOnElement", SPLIT_ON_ELEMENT);
+
+api.setParams(params);
+
+FileSystem fs = FileSystem.get(new Configuration());
+plugin = new FileGZipCollectorPlugin(fs);
+}
+
+@Test
+void test() throws CollectorException {
+
+final Stream<String> stream = plugin.collect(api, new AggregatorReport());
+
+stream.limit(10).forEach(s -> {
+Assertions.assertTrue(s.length() > 0);
+log.info(s);
+});
+}
+}
@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.rest;
 
 import java.util.HashMap;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Stream;
 
 import org.junit.jupiter.api.Assertions;
@@ -35,11 +36,11 @@
 private final String resultTotalXpath = "/*/*[local-name()='links']/*[local-name()='meta']/*[local-name()='total']";
 
 private final String resumptionParam = "page";
-private final String resumptionType = "page";
-private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";
+private final String resumptionType = "scan";
+private final String resumptionXpath = "substring-before(substring-after(/*/*[local-name()='links']/*[local-name()='next'], 'page='), '&')";
 
-private final String resultSizeParam = "";
-private final String resultSizeValue = "";
+private final String resultSizeParam = "page[size]";
+private final String resultSizeValue = "100";
 
 private final String resultFormatParam = "format";
 private final String resultFormatValue = "json";
@@ -69,11 +70,11 @@
 
 @Test
 @Disabled
-void test() throws CollectorException {
+void test_limited() throws CollectorException {
 final AtomicInteger i = new AtomicInteger(0);
 final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
 
-stream.limit(200).forEach(s -> {
+stream.limit(2000).forEach(s -> {
 Assertions.assertTrue(s.length() > 0);
 i.incrementAndGet();
 log.info(s);
@@ -82,4 +83,23 @@
 log.info("{}", i.intValue());
 Assertions.assertTrue(i.intValue() > 0);
 }
+
+@Test
+@Disabled
+void test_all() throws CollectorException {
+final AtomicLong i = new AtomicLong(0);
+final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
+
+stream.forEach(s -> {
+Assertions.assertTrue(s.length() > 0);
+if ((i.incrementAndGet() % 1000) == 0) {
+log.info("COLLECTED: {}", i.get());
+}
+
+});
+
+log.info("TOTAL: {}", i.get());
+Assertions.assertTrue(i.get() > 0);
+}
+
 }
@@ -4,6 +4,11 @@
 
 package eu.dnetlib.dhp.collection.plugin.rest;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.HashMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Stream;
@@ -12,6 +17,8 @@ import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.gson.Gson;
+
 import eu.dnetlib.dhp.collection.ApiDescriptor;
 import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
 import eu.dnetlib.dhp.common.collection.CollectorException;
@@ -25,18 +32,18 @@ class RestCollectorPluginTest {
 
 private static final Logger log = LoggerFactory.getLogger(RestCollectorPluginTest.class);
 
-private final String baseUrl = "https://share.osf.io/api/v2/search/creativeworks/_search";
-private final String resumptionType = "count";
-private final String resumptionParam = "from";
-private final String entityXpath = "//hits/hits";
-private final String resumptionXpath = "//hits";
-private final String resultTotalXpath = "//hits/total";
-private final String resultFormatParam = "format";
+private final String baseUrl = "https://ddh-openapi.worldbank.org/search";
+private final String resumptionType = "discover";
+private final String resumptionParam = "skip";
+private final String entityXpath = "//*[local-name()='data']";
+private final String resumptionXpath = "";
+private final String resultTotalXpath = "//*[local-name()='count']";
+private final String resultFormatParam = "";
 private final String resultFormatValue = "json";
-private final String resultSizeParam = "size";
+private final String resultSizeParam = "top";
 private final String resultSizeValue = "10";
 // private String query = "q=%28sources%3ASocArXiv+AND+type%3Apreprint%29";
-private final String query = "q=%28sources%3AengrXiv+AND+type%3Apreprint%29";
+private final String query = "";
 // private String query = "=(sources:engrXiv AND type:preprint)";
 
 private final String protocolDescriptor = "rest_json2xml";
@@ -56,6 +63,7 @@ class RestCollectorPluginTest {
 params.put("resultSizeValue", resultSizeValue);
 params.put("queryParams", query);
 params.put("entityXpath", entityXpath);
+params.put("requestHeaderMap", "{\"User-Agent\": \"OpenAIRE DEV\"}");
 
 api.setBaseUrl(baseUrl);
 api.setParams(params);
@@ -78,4 +86,19 @@ class RestCollectorPluginTest {
 log.info("{}", i.intValue());
 Assertions.assertTrue(i.intValue() > 0);
 }
+
+@Disabled
+@Test
+void testUrl() throws IOException {
+String url_s = "https://ddh-openapi.worldbank.org/search?&top=10";
+URL url = new URL(url_s);
+final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+conn.setRequestMethod("GET");
+conn.setRequestProperty("User-Agent", "OpenAIRE");
+Gson gson = new Gson();
+System.out.println("Request header");
+System.out.println(gson.toJson(conn.getHeaderFields()));
+InputStream inputStream = conn.getInputStream();
+
+}
 }
@@ -44,7 +44,7 @@ public class RestIteratorTest {
 
 final RestIterator iterator = new RestIterator(clientParams, baseUrl, resumptionType, resumptionParam,
 resumptionXpath, resultTotalXpath, resultFormatParam, resultFormatValue, resultSizeParam, resultSizeValue,
-query, entityXpath, authMethod, authToken, resultOffsetParam);
+query, entityXpath, authMethod, authToken, resultOffsetParam, null);
 int i = 20;
 while (iterator.hasNext() && i > 0) {
 String result = iterator.next();
@@ -789,10 +789,6 @@
 "value": "2227-9717",
 "type": "electronic"
 },
-{
-"value": "VALUE",
-"type": "PIPPO"
-},
 {
 "value": "1063-4584",
 "type": "pu"

Binary file not shown.
@@ -2,7 +2,9 @@ package eu.dnetlib.dhp.collection.crossref
 
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
-import org.junit.jupiter.api.BeforeEach
+import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
+import org.apache.commons.io.IOUtils
+import org.junit.jupiter.api.{BeforeEach, Test}
 import org.junit.jupiter.api.extension.ExtendWith
 import org.mockito.junit.jupiter.MockitoExtension
 import org.slf4j.{Logger, LoggerFactory}
@@ -18,4 +20,13 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
 super.setUpVocabulary()
 }
 
+@Test
+def mappingRecord(): Unit = {
+val input =
+IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")
+
+println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All))
+
+}
+
 }
@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.mag
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.col
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.Test
 
@@ -18,10 +19,8 @@ class MAGMappingTest {
 .master("local[*]")
 .getOrCreate()
 
-val s = new SparkMagOrganizationAS(null, null, null)
-s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")
+val s = new SparkMAGtoOAF(null, null, null)
+s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
 
 }
 
 @Test
@@ -16,6 +16,7 @@ import org.mockito.junit.jupiter.MockitoExtension
 
 import java.io.{BufferedReader, InputStream, InputStreamReader}
 import java.util.zip.GZIPInputStream
+import javax.xml.stream.XMLInputFactory
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
@@ -49,10 +50,8 @@ class BioScholixTest extends AbstractVocabularyTest {
 
 @Test
 def testEBIData() = {
-val inputXML = Source
-.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-.mkString
-val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
+val inputFactory = XMLInputFactory.newInstance
+val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
 new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
 }
 
@@ -91,9 +90,10 @@ class BioScholixTest extends AbstractVocabularyTest {
 
 @Test
 def testParsingPubmedXML(): Unit = {
-val xml = new XMLEventReader(
-Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-)
+val inputFactory = XMLInputFactory.newInstance
+val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
 
 val parser = new PMParser(xml)
 parser.foreach(checkPMArticle)
 }
@@ -156,9 +156,9 @@ class BioScholixTest extends AbstractVocabularyTest {
 @Test
 def testPubmedMapping(): Unit = {
 
-val xml = new XMLEventReader(
-Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
-)
+val inputFactory = XMLInputFactory.newInstance
+val xml = inputFactory.createXMLEventReader(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
 val parser = new PMParser(xml)
 val results = ListBuffer[Oaf]()
 parser.foreach(x => results += PubMedToOaf.convert(x, vocabularies))
@@ -26,15 +26,15 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.Software;
 
-public class PrepareSimpleEntititiesJob {
+public class PrepareSimpleEntitiesJob {
 
-private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntititiesJob.class);
+private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntitiesJob.class);
 
 public static void main(final String[] args) throws Exception {
 final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 IOUtils
 .toString(
-PrepareSimpleEntititiesJob.class
+PrepareSimpleEntitiesJob.class
 .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
 parser.parseArgument(args);
@@ -160,8 +160,7 @@ public class ConversionUtils {
 .stream()
 .filter(Objects::nonNull)
 .filter(pid -> pid.getQualifier() != null)
-.filter(pid -> pid.getQualifier().getClassid() != null)
-.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
+.filter(pid -> StringUtils.startsWithIgnoreCase(pid.getQualifier().getClassid(), ModelConstants.ORCID))
 .map(StructuredProperty::getValue)
 .map(ConversionUtils::cleanOrcid)
 .filter(StringUtils::isNotBlank)
@ -7,7 +7,7 @@
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>outputDir</name>
|
<name>outputDir</name>
|
||||||
<description>the path where the the generated data will be stored</description>
|
<description>the path where the generated data will be stored</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>datasourceIdWhitelist</name>
|
<name>datasourceIdWhitelist</name>
|
||||||
|
@ -179,17 +179,18 @@
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>PrepareSimpleEntititiesJob</name>
|
<name>PrepareSimpleEntititiesJob</name>
|
||||||
<class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntititiesJob</class>
|
<class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntitiesJob</class>
|
||||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=5000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
|
@ -209,11 +210,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
|
@ -234,11 +236,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
|
@ -258,11 +261,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=5000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
|
@ -282,11 +286,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=10000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
|
@ -306,11 +311,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=2000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
|
@ -332,11 +338,12 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.shuffle.partitions=3840
+ --conf spark.sql.shuffle.partitions=8000
  </spark-opts>
  <arg>--graphPath</arg><arg>${graphInputPath}</arg>
  <arg>--workingDir</arg><arg>${workingDir}</arg>

@ -356,11 +363,12 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.shuffle.partitions=3840
+ --conf spark.sql.shuffle.partitions=8000
  </spark-opts>
  <arg>--graphPath</arg><arg>${graphInputPath}</arg>
  <arg>--workingDir</arg><arg>${workingDir}</arg>

@ -380,11 +388,12 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.shuffle.partitions=3840
+ --conf spark.sql.shuffle.partitions=8000
  </spark-opts>
  <arg>--graphPath</arg><arg>${graphInputPath}</arg>
  <arg>--workingDir</arg><arg>${workingDir}</arg>

@ -404,11 +413,12 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.shuffle.partitions=3840
+ --conf spark.sql.shuffle.partitions=8000
  </spark-opts>
  <arg>--graphPath</arg><arg>${graphInputPath}</arg>
  <arg>--workingDir</arg><arg>${workingDir}</arg>

@ -428,11 +438,12 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.shuffle.partitions=3840
+ --conf spark.sql.shuffle.partitions=8000
  </spark-opts>
  <arg>--graphPath</arg><arg>${graphInputPath}</arg>
  <arg>--workingDir</arg><arg>${workingDir}</arg>

@ -452,11 +463,12 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.shuffle.partitions=3840
+ --conf spark.sql.shuffle.partitions=8000
  </spark-opts>
  <arg>--graphPath</arg><arg>${graphInputPath}</arg>
  <arg>--workingDir</arg><arg>${workingDir}</arg>

@ -476,11 +488,12 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
- --conf spark.sql.shuffle.partitions=3840
+ --conf spark.sql.shuffle.partitions=8000
  </spark-opts>
  <arg>--workingDir</arg><arg>${workingDir}</arg>
  <arg>--outputDir</arg><arg>${outputDir}</arg>
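Note: the hunks above all add an explicit executor memory overhead (reusing ${sparkExecutorMemory} as its value) and retune spark.sql.shuffle.partitions per action. As a rough, non-authoritative sketch in Java, the same settings expressed programmatically on a SparkConf would look like the following; the literal "5G" is a hypothetical stand-in for ${sparkExecutorMemory}.

    import org.apache.spark.SparkConf;

    public class SparkOptsSketch {
        public static SparkConf build() {
            // Sketch only: mirrors the <spark-opts> additions above.
            return new SparkConf()
                .set("spark.executor.memory", "5G")
                .set("spark.executor.memoryOverhead", "5G")
                .set("spark.sql.shuffle.partitions", "8000");
        }
    }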
@ -503,6 +516,7 @@
  <spark-opts>
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}

@ -535,6 +549,7 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@ -562,6 +577,7 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@ -585,6 +601,7 @@
  <spark-opts>
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -0,0 +1,66 @@
+
+ package eu.dnetlib.dhp.broker.oa.matchers.simple;
+
+ import static org.junit.jupiter.api.Assertions.assertEquals;
+ import static org.junit.jupiter.api.Assertions.assertTrue;
+
+ import java.util.List;
+
+ import org.junit.jupiter.api.BeforeEach;
+ import org.junit.jupiter.api.Test;
+
+ import eu.dnetlib.broker.objects.OaBrokerAuthor;
+ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
+
+ class EnrichMissingAuthorOrcidTest {
+
+     final EnrichMissingAuthorOrcid matcher = new EnrichMissingAuthorOrcid();
+
+     @BeforeEach
+     void setUp() throws Exception {
+     }
+
+     @Test
+     void testFindDifferences_1() {
+         final OaBrokerMainEntity source = new OaBrokerMainEntity();
+         final OaBrokerMainEntity target = new OaBrokerMainEntity();
+         final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
+         assertTrue(list.isEmpty());
+     }
+
+     @Test
+     void testFindDifferences_2() {
+         final OaBrokerMainEntity source = new OaBrokerMainEntity();
+         final OaBrokerMainEntity target = new OaBrokerMainEntity();
+
+         source.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
+         target.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null));
+
+         final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
+         assertEquals(1, list.size());
+     }
+
+     @Test
+     void testFindDifferences_3() {
+         final OaBrokerMainEntity source = new OaBrokerMainEntity();
+         final OaBrokerMainEntity target = new OaBrokerMainEntity();
+
+         source.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null));
+         target.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
+
+         final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
+         assertTrue(list.isEmpty());
+     }
+
+     @Test
+     void testFindDifferences_4() {
+         final OaBrokerMainEntity source = new OaBrokerMainEntity();
+         final OaBrokerMainEntity target = new OaBrokerMainEntity();
+         source.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
+         target.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
+
+         final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
+         assertTrue(list.isEmpty());
+     }
+
+ }
@ -2,27 +2,32 @@
  package eu.dnetlib.dhp.broker.oa.util;

  import static org.junit.jupiter.api.Assertions.assertEquals;
+ import static org.junit.jupiter.api.Assertions.assertNull;

  import java.util.ArrayList;
+ import java.util.Arrays;
  import java.util.List;

  import org.junit.jupiter.api.BeforeEach;
  import org.junit.jupiter.api.Test;

+ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
  import eu.dnetlib.broker.objects.OaBrokerTypedValue;
+ import eu.dnetlib.dhp.schema.common.ModelConstants;
+ import eu.dnetlib.dhp.schema.oaf.Author;
  import eu.dnetlib.dhp.schema.oaf.Instance;
  import eu.dnetlib.dhp.schema.oaf.Qualifier;
  import eu.dnetlib.dhp.schema.oaf.Result;
  import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

- class ConversionUtilsTest {
+ public class ConversionUtilsTest {

      @BeforeEach
-     void setUp() throws Exception {
+     public void setUp() throws Exception {
      }

      @Test
-     void testAllResultPids() {
+     public void testAllResultPids() {
          final Qualifier qf = new Qualifier();
          qf.setClassid("test");
          qf.setClassname("test");

@ -91,4 +96,42 @@ class ConversionUtilsTest {
          assertEquals(6, list.size());
      }
+
+     public void testOafResultToBrokerResult() {
+
+         final Author a1 = createAuthor("Michele Artini", "0000-0002-4406-428X");
+         final Author a2 = createAuthor("Claudio Atzori", "http://orcid.org/0000-0001-9613-6639");
+         final Author a3 = createAuthor("Alessia Bardi", null);
+
+         final Result r = new Result();
+         r.setAuthor(Arrays.asList(a1, a2, a3));
+
+         final OaBrokerMainEntity br = ConversionUtils.oafResultToBrokerResult(r);
+
+         assertEquals(3, br.getCreators().size());
+         assertEquals("0000-0002-4406-428X", br.getCreators().get(0).getOrcid());
+         assertEquals("0000-0001-9613-6639", br.getCreators().get(1).getOrcid());
+         assertNull(br.getCreators().get(2).getOrcid());
+     }
+
+     private Author createAuthor(final String name, final String orcid) {
+
+         final Author a = new Author();
+         a.setFullname("Michele Artini");
+
+         if (orcid != null) {
+             final Qualifier q = new Qualifier();
+             q.setClassid(ModelConstants.ORCID);
+             q.setClassname(ModelConstants.ORCID);
+             q.setSchemeid("dnet:pids");
+             q.setSchemename("dnet:pids");
+
+             final StructuredProperty pid = new StructuredProperty();
+             pid.setQualifier(q);
+             pid.setValue(orcid);
+
+             a.setPid(Arrays.asList(pid));
+         }
+         return a;
+     }
+
  }
@ -53,24 +53,10 @@
  <artifactId>dhp-pace-core</artifactId>
  <version>${project.version}</version>
  </dependency>

  <dependency>
  <groupId>org.apache.commons</groupId>
  <artifactId>commons-lang3</artifactId>
  </dependency>

- <dependency>
-     <groupId>org.scala-lang.modules</groupId>
-     <artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
-     <version>1.0.2</version>
- </dependency>
-
- <dependency>
-     <groupId>org.scala-lang.modules</groupId>
-     <artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
-     <version>2.11.0</version>
- </dependency>
-
  <dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_${scala.binary.version}</artifactId>

@ -79,16 +65,10 @@
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_${scala.binary.version}</artifactId>
  </dependency>

  <dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-graphx_${scala.binary.version}</artifactId>
  </dependency>

- <dependency>
-     <groupId>com.arakelian</groupId>
-     <artifactId>java-jq</artifactId>
- </dependency>
  <dependency>
  <groupId>dom4j</groupId>
  <artifactId>dom4j</artifactId>

@ -101,10 +81,6 @@
  <groupId>com.fasterxml.jackson.core</groupId>
  <artifactId>jackson-databind</artifactId>
  </dependency>
- <dependency>
-     <groupId>com.fasterxml.jackson.core</groupId>
-     <artifactId>jackson-core</artifactId>
- </dependency>
  <dependency>
  <groupId>org.apache.httpcomponents</groupId>
  <artifactId>httpclient</artifactId>
@ -42,6 +42,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
  import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
  import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
  import eu.dnetlib.pace.config.DedupConfig;
+ import eu.dnetlib.pace.util.SparkCompatUtils;
  import scala.Tuple3;
  import scala.collection.JavaConversions;

@ -148,8 +149,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
  Dataset<Row> pivotHistory = spark
      .createDataset(
          Collections.emptyList(),
-         RowEncoder
-             .apply(StructType.fromDDL("id STRING, lastUsage STRING")));
+         SparkCompatUtils.encoderFor(StructType.fromDDL("id STRING, lastUsage STRING")));

  if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
      pivotHistory = spark

@ -203,8 +203,8 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
  WindowSpec w = Window
      .partitionBy("groupId")
      .orderBy(
-         col("lastUsage").desc_nulls_last(),
          col("pidType").asc_nulls_last(),
+         col("lastUsage").desc_nulls_last(),
          col("collectedfrom").desc_nulls_last(),
          col("date").asc_nulls_last(),
          col("id").asc_nulls_last());
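Note: the removed RowEncoder.apply call is the Spark API replaced by the project's own eu.dnetlib.pace.util.SparkCompatUtils.encoderFor helper, presumably to absorb RowEncoder differences between Spark versions. Its implementation is not shown here; as a hedged alternative sketch, an empty Dataset<Row> with the same schema can also be built without touching the encoder API at all:

    import java.util.Collections;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;
    import org.apache.spark.sql.types.StructType;

    public class EmptyPivotHistorySketch {
        // Sketch only: an encoder-free way to obtain an empty Dataset<Row> with the
        // "id STRING, lastUsage STRING" schema used by the pivot history above.
        public static Dataset<Row> emptyPivotHistory(SparkSession spark) {
            StructType schema = StructType.fromDDL("id STRING, lastUsage STRING");
            return spark.createDataFrame(Collections.emptyList(), schema);
        }
    }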
@ -22,7 +22,9 @@ import eu.dnetlib.dhp.oa.dedup.model.OrgSimRel;
  import eu.dnetlib.dhp.schema.common.EntityType;
  import eu.dnetlib.dhp.schema.common.ModelConstants;
  import eu.dnetlib.dhp.schema.common.ModelSupport;
+ import eu.dnetlib.dhp.schema.oaf.Field;
  import eu.dnetlib.dhp.schema.oaf.Organization;
+ import eu.dnetlib.dhp.schema.oaf.Qualifier;
  import eu.dnetlib.dhp.schema.oaf.Relation;
  import eu.dnetlib.dhp.utils.ISLookupClientFactory;
  import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

@ -164,12 +166,12 @@ public class SparkPrepareNewOrgs extends AbstractSparkAction {
  .map(
      (MapFunction<Tuple2<Tuple2<String, Organization>, Tuple2<String, String>>, OrgSimRel>) r -> new OrgSimRel(
          "",
-         r._1()._2().getOriginalId().get(0),
+         Optional.ofNullable(r._1()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null),
-         r._1()._2().getLegalname() != null ? r._1()._2().getLegalname().getValue() : "",
+         Optional.ofNullable(r._1()._2().getLegalname()).map(Field::getValue).orElse(""),
-         r._1()._2().getLegalshortname() != null ? r._1()._2().getLegalshortname().getValue() : "",
+         Optional.ofNullable(r._1()._2().getLegalshortname()).map(Field::getValue).orElse(""),
-         r._1()._2().getCountry() != null ? r._1()._2().getCountry().getClassid() : "",
+         Optional.ofNullable(r._1()._2().getCountry()).map(Qualifier::getClassid).orElse(""),
-         r._1()._2().getWebsiteurl() != null ? r._1()._2().getWebsiteurl().getValue() : "",
+         Optional.ofNullable(r._1()._2().getWebsiteurl()).map(Field::getValue).orElse(""),
-         r._1()._2().getCollectedfrom().get(0).getValue(),
+         Optional.ofNullable(r._1()._2().getCollectedfrom()).map(cf -> cf.get(0).getValue()).orElse(null),
          "",
          structuredPropertyListToString(r._1()._2().getPid()),
          parseECField(r._1()._2().getEclegalbody()),
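Note: every replacement above follows the same null-guard pattern: wrap the possibly-null field in Optional, map it to the wanted value, and fall back to a default. A generic helper expressing that pattern (the helper itself is hypothetical, not part of this diff) could look like:

    import java.util.Optional;
    import java.util.function.Function;

    public final class NullSafe {
        private NullSafe() {
        }

        // Hypothetical helper illustrating the Optional.ofNullable(...).map(...).orElse(...)
        // pattern introduced above, e.g. valueOr(org.getLegalname(), Field::getValue, "").
        public static <T, R> R valueOr(T nullable, Function<T, R> getter, R fallback) {
            return Optional.ofNullable(nullable).map(getter).orElse(fallback);
        }
    }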
@ -217,7 +217,7 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
  final Organization o = r._2()._2();
  return new OrgSimRel(
      r._1()._1(),
-     o.getOriginalId().get(0),
+     Optional.ofNullable(o.getOriginalId()).map(oid -> oid.get(0)).orElse(null),
      Optional.ofNullable(o.getLegalname()).map(Field::getValue).orElse(""),
      Optional.ofNullable(o.getLegalshortname()).map(Field::getValue).orElse(""),
      Optional.ofNullable(o.getCountry()).map(Qualifier::getClassid).orElse(""),

@ -249,7 +249,9 @@ public class SparkPrepareOrgRels extends AbstractSparkAction {
  .map(
      (MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
          OrgSimRel orgSimRel = r._1()._2();
-         orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
+         orgSimRel
+             .setLocal_id(
+                 Optional.ofNullable(r._2()._2().getOriginalId()).map(oid -> oid.get(0)).orElse(null));
          return orgSimRel;
      },
      Encoders.bean(OrgSimRel.class));
@ -8,7 +8,6 @@ import org.apache.spark.SparkConf;
  import org.apache.spark.api.java.function.MapFunction;
  import org.apache.spark.api.java.function.ReduceFunction;
  import org.apache.spark.sql.*;
- import org.apache.spark.sql.catalyst.encoders.RowEncoder;
  import org.apache.spark.sql.types.StructType;
  import org.slf4j.Logger;
  import org.slf4j.LoggerFactory;

@ -23,6 +22,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
  import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
  import eu.dnetlib.dhp.utils.ISLookupClientFactory;
  import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+ import eu.dnetlib.pace.util.SparkCompatUtils;
  import scala.Tuple2;
  import scala.Tuple3;

@ -145,7 +145,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
  StructType idsSchema = StructType
      .fromDDL("`id` STRING, `dataInfo` STRUCT<`deletedbyinference`:BOOLEAN,`invisible`:BOOLEAN>");

- Dataset<Row> allIds = spark.emptyDataset(RowEncoder.apply(idsSchema));
+ Dataset<Row> allIds = spark.emptyDataset(SparkCompatUtils.encoderFor(idsSchema));

  for (EntityType entityType : ModelSupport.entityTypes.keySet()) {
      String entityPath = graphBasePath + '/' + entityType.name();
@ -15,4 +15,12 @@
  <name>oozie.action.sharelib.for.spark</name>
  <value>spark2</value>
  </property>
+ <property>
+     <name>hiveMetastoreUris</name>
+     <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
+ </property>
+ <property>
+     <name>pivotHistoryDatabase</name>
+     <value></value>
+ </property>
  </configuration>
@ -198,6 +198,8 @@
  <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
  <arg>--actionSetId</arg><arg>${actionSetId}</arg>
  <arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
+ <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+ <arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
  </spark>
  <ok to="PrepareOrgRels"/>
  <error to="Kill"/>
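Note: the two new arguments hand the dedup job a Hive metastore endpoint and the database holding the pivot history. As a hedged sketch (parameter names assumed from the arguments above, application name illustrative), a Spark job would typically consume the metastore URI like this:

    import org.apache.spark.sql.SparkSession;

    public class HiveSessionSketch {
        // Sketch only: builds a Hive-enabled session from the hiveMetastoreUris
        // value passed by the workflow above.
        public static SparkSession build(String hiveMetastoreUris) {
            return SparkSession
                .builder()
                .appName("dedup-merge-rels")
                .config("hive.metastore.uris", hiveMetastoreUris)
                .enableHiveSupport()
                .getOrCreate();
        }
    }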
@ -0,0 +1,103 @@
+
+ package eu.dnetlib.dhp.oa.dedup;
+
+ import static org.junit.jupiter.api.Assertions.assertEquals;
+
+ import java.io.BufferedReader;
+ import java.io.FileReader;
+ import java.io.IOException;
+ import java.io.Serializable;
+ import java.lang.reflect.InvocationTargetException;
+ import java.nio.file.Paths;
+ import java.util.ArrayList;
+ import java.util.List;
+
+ import org.codehaus.jackson.map.ObjectMapper;
+ import org.junit.jupiter.api.BeforeEach;
+ import org.junit.jupiter.api.Test;
+
+ import eu.dnetlib.dhp.schema.oaf.DataInfo;
+ import eu.dnetlib.dhp.schema.oaf.Dataset;
+ import eu.dnetlib.dhp.schema.oaf.utils.MergeUtils;
+ import eu.dnetlib.pace.util.MapDocumentUtil;
+ import scala.Tuple2;
+
+ class DatasetMergerTest implements Serializable {
+
+     private List<Tuple2<String, Dataset>> datasets;
+
+     private String testEntityBasePath;
+     private DataInfo dataInfo;
+     private final String dedupId = "50|doi_________::3d18564ef27ebe9ef3bd8b4dec67e148";
+     private Dataset dataset_top;
+
+     @BeforeEach
+     public void setUp() throws Exception {
+         testEntityBasePath = Paths
+             .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
+             .toFile()
+             .getAbsolutePath();
+
+         datasets = readSample(testEntityBasePath + "/dataset_merge.json", Dataset.class);
+
+         dataset_top = getTopPub(datasets);
+
+         dataInfo = setDI();
+     }
+
+     @Test
+     void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
+         Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator());
+
+         // verify id
+         assertEquals(dedupId, pub_merged.getId());
+         assertEquals(2, pub_merged.getInstance().size());
+     }
+
+     public DataInfo setDI() {
+         DataInfo dataInfo = new DataInfo();
+         dataInfo.setTrust("0.9");
+         dataInfo.setDeletedbyinference(false);
+         dataInfo.setInferenceprovenance("testing");
+         dataInfo.setInferred(true);
+         return dataInfo;
+     }
+
+     public Dataset getTopPub(List<Tuple2<String, Dataset>> publications) {
+
+         Double maxTrust = 0.0;
+         Dataset maxPub = new Dataset();
+         for (Tuple2<String, Dataset> publication : publications) {
+             Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust());
+             if (pubTrust > maxTrust) {
+                 maxTrust = pubTrust;
+                 maxPub = publication._2();
+             }
+         }
+         return maxPub;
+     }
+
+     public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
+         List<Tuple2<String, T>> res = new ArrayList<>();
+         BufferedReader reader;
+         try {
+             reader = new BufferedReader(new FileReader(path));
+             String line = reader.readLine();
+             while (line != null) {
+                 res
+                     .add(
+                         new Tuple2<>(
+                             MapDocumentUtil.getJPathString("$.id", line),
+                             new ObjectMapper().readValue(line, clazz)));
+                 // read next line
+                 line = reader.readLine();
+             }
+             reader.close();
+         } catch (IOException e) {
+             e.printStackTrace();
+         }
+
+         return res;
+     }
+
+ }
@ -93,14 +93,14 @@ class EntityMergerTest implements Serializable {
  assertEquals(pub_top.getJournal().getConferencedate(), pub_merged.getJournal().getConferencedate());
  assertEquals(pub_top.getJournal().getConferenceplace(), pub_merged.getJournal().getConferenceplace());
  assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
- assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
+ assertEquals(pub_top.getResulttype().getClassid(), pub_merged.getResulttype().getClassid());
- assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
+ assertEquals(pub_top.getLanguage().getClassid(), pub_merged.getLanguage().getClassid());
- assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
+ assertEquals("Elsevier BV", pub_merged.getPublisher().getValue());
- assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
+ assertEquals(pub_top.getEmbargoenddate().getValue(), pub_merged.getEmbargoenddate().getValue());
  assertEquals(pub_top.getResourcetype().getClassid(), "");
  assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
  assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
- assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
+ // assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
  assertEquals(3, pub_merged.getInstance().size());
  assertEquals(2, pub_merged.getCountry().size());
  assertEquals(0, pub_merged.getSubject().size());
@ -49,7 +49,7 @@
  },
  {
  "field": "country",
- "comparator": "exactMatch",
+ "comparator": "countryMatch",
  "weight": 1,
  "countIfUndefined": "true",
  "params": {}

File diff suppressed because one or more lines are too long
@ -172,7 +172,7 @@ public class SparkBulkTagJob {
  .option("compression", "gzip")
  .json(outputPath + "project");

- readPath(spark, outputPath + "project", Datasource.class)
+ readPath(spark, outputPath + "project", Project.class)
  .write()
  .mode(SaveMode.Overwrite)
  .option("compression", "gzip")
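Note: the fix above makes the re-read of the freshly written project dump use the matching Project bean instead of Datasource. readPath is the project's own helper and is not shown here; as an illustrative, stand-alone sketch of such a typed JSON read:

    import com.fasterxml.jackson.databind.ObjectMapper;
    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;
    import eu.dnetlib.dhp.schema.oaf.Project;

    public class ReadProjectsSketch {
        // Sketch only: reads one JSON entity per line into a typed Dataset<Project>,
        // which is roughly what the corrected readPath(..., Project.class) call amounts to.
        public static Dataset<Project> read(SparkSession spark, String path) {
            return spark
                .read()
                .textFile(path)
                .map(
                    (MapFunction<String, Project>) json -> new ObjectMapper().readValue(json, Project.class),
                    Encoders.bean(Project.class));
        }
    }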
@ -61,7 +61,8 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
  subject.getQualifier().setClassname(vocabulary.getName());
  }
  } else {
- final String provenanceActionClassId = Optional.ofNullable(subject.getDataInfo())
+ final String provenanceActionClassId = Optional
+     .ofNullable(subject.getDataInfo())
      .map(DataInfo::getProvenanceaction)
      .map(Qualifier::getClassid)
      .orElse(null);
@ -398,6 +398,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
  o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
  o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
  o.setCountry(prepareQualifierSplitting(rs.getString("country")));
+ o.setOrganizationType(Organization.OrganizationType.valueOf(rs.getString("typology")));
  o.setDataInfo(info);
  o.setLastupdatetimestamp(lastUpdateTimestamp);
@ -156,6 +156,7 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@ -190,6 +191,7 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@ -224,6 +226,7 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@ -258,6 +261,7 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@ -292,6 +296,7 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@ -326,6 +331,7 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@ -360,6 +366,7 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

@ -394,6 +401,7 @@
  --executor-cores=${sparkExecutorCores}
  --executor-memory=${sparkExecutorMemory}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -116,17 +116,19 @@
  --executor-memory=${sparkExecutorMemory}
  --executor-cores=${sparkExecutorCores}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
  --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=10000
  </spark-opts>
  <arg>--inputPath</arg><arg>${inputPath}/publication</arg>
  <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
  <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
  <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>8000</arg>
+ <arg>--numPartitions</arg><arg>10000</arg>
  </spark>
  <ok to="join_import"/>
  <error to="Kill"/>

@ -143,17 +145,19 @@
  --executor-memory=${sparkExecutorMemory}
  --executor-cores=${sparkExecutorCores}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
  --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=4000
  </spark-opts>
  <arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
  <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
  <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
  <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>4000</arg>
+ <arg>--numPartitions</arg><arg>8000</arg>
  </spark>
  <ok to="join_import"/>
  <error to="Kill"/>

@ -170,11 +174,13 @@
  --executor-memory=${sparkExecutorMemory}
  --executor-cores=${sparkExecutorCores}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
  --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=8000
  </spark-opts>
  <arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
  <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>

@ -197,17 +203,19 @@
  --executor-memory=${sparkExecutorMemory}
  --executor-cores=${sparkExecutorCores}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
  --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=1000
  </spark-opts>
  <arg>--inputPath</arg><arg>${inputPath}/software</arg>
  <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
  <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
  <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>300</arg>
+ <arg>--numPartitions</arg><arg>1000</arg>
  </spark>
  <ok to="join_import"/>
  <error to="Kill"/>

@ -224,17 +232,19 @@
  --executor-memory=${sparkExecutorMemory}
  --executor-cores=${sparkExecutorCores}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
  --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=200
  </spark-opts>
  <arg>--inputPath</arg><arg>${inputPath}/datasource</arg>
  <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
  <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
  <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>100</arg>
+ <arg>--numPartitions</arg><arg>200</arg>
  </spark>
  <ok to="join_import"/>
  <error to="Kill"/>

@ -251,17 +261,19 @@
  --executor-memory=${sparkExecutorMemory}
  --executor-cores=${sparkExecutorCores}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
  --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=1000
  </spark-opts>
  <arg>--inputPath</arg><arg>${inputPath}/organization</arg>
  <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
  <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
  <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>400</arg>
+ <arg>--numPartitions</arg><arg>1000</arg>
  </spark>
  <ok to="join_import"/>
  <error to="Kill"/>

@ -278,17 +290,19 @@
  --executor-memory=${sparkExecutorMemory}
  --executor-cores=${sparkExecutorCores}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
  --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=1000
  </spark-opts>
  <arg>--inputPath</arg><arg>${inputPath}/project</arg>
  <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
  <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
  <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>100</arg>
+ <arg>--numPartitions</arg><arg>1000</arg>
  </spark>
  <ok to="join_import"/>
  <error to="Kill"/>

@ -305,17 +319,19 @@
  --executor-memory=${sparkExecutorMemory}
  --executor-cores=${sparkExecutorCores}
  --driver-memory=${sparkDriverMemory}
+ --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
  --conf spark.extraListeners=${spark2ExtraListeners}
  --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
  --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
  --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
  --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
+ --conf spark.sql.shuffle.partitions=15000
  </spark-opts>
  <arg>--inputPath</arg><arg>${inputPath}/relation</arg>
  <arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
  <arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
  <arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
- <arg>--numPartitions</arg><arg>10000</arg>
+ <arg>--numPartitions</arg><arg>15000</arg>
  </spark>
  <ok to="join_import"/>
  <error to="Kill"/>
@ -45,6 +45,7 @@
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.shuffle.partitions=15000
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -79,6 +80,7 @@
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.shuffle.partitions=10000
|
--conf spark.sql.shuffle.partitions=10000
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
|
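The hunks above raise --conf spark.sql.shuffle.partitions together with the --numPartitions argument, sized per entity set: organizations and projects move to 1000, relations to 10000-15000. A minimal sketch of how such a value typically drives the Hive import step follows; the method and table names are assumptions for illustration, not the project's actual importer code.

import org.apache.spark.sql.{SaveMode, SparkSession}

// Hypothetical sketch: repartition the input before writing so the Hive table is
// produced with roughly `numPartitions` files, mirroring the --numPartitions argument above.
def importTable(spark: SparkSession, inputPath: String, hiveDbName: String,
                tableName: String, numPartitions: Int): Unit = {
  spark.read
    .json(inputPath)                 // e.g. ${inputPath}/organization
    .repartition(numPartitions)      // 1000 for organization/project, 15000 for relation
    .write
    .mode(SaveMode.Overwrite)
    .saveAsTable(s"$hiveDbName.$tableName")
}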
|
@ -28,7 +28,8 @@ SELECT
|
||||||
(array_remove(array_cat(ARRAY[o.ec_internationalorganization], array_agg(od.ec_internationalorganization)), NULL))[1] AS ecinternationalorganization,
|
(array_remove(array_cat(ARRAY[o.ec_internationalorganization], array_agg(od.ec_internationalorganization)), NULL))[1] AS ecinternationalorganization,
|
||||||
(array_remove(array_cat(ARRAY[o.ec_enterprise], array_agg(od.ec_enterprise)), NULL))[1] AS ecenterprise,
|
(array_remove(array_cat(ARRAY[o.ec_enterprise], array_agg(od.ec_enterprise)), NULL))[1] AS ecenterprise,
|
||||||
(array_remove(array_cat(ARRAY[o.ec_smevalidated], array_agg(od.ec_smevalidated)), NULL))[1] AS ecsmevalidated,
|
(array_remove(array_cat(ARRAY[o.ec_smevalidated], array_agg(od.ec_smevalidated)), NULL))[1] AS ecsmevalidated,
|
||||||
(array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1] AS ecnutscode
|
(array_remove(array_cat(ARRAY[o.ec_nutscode], array_agg(od.ec_nutscode)), NULL))[1] AS ecnutscode,
|
||||||
|
org_types.name AS typology
|
||||||
FROM organizations o
|
FROM organizations o
|
||||||
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
|
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
|
||||||
LEFT OUTER JOIN urls u ON (u.id = o.id)
|
LEFT OUTER JOIN urls u ON (u.id = o.id)
|
||||||
|
@ -37,6 +38,7 @@ FROM organizations o
|
||||||
LEFT OUTER JOIN oa_duplicates d ON (o.id = d.local_id AND d.reltype != 'is_different')
|
LEFT OUTER JOIN oa_duplicates d ON (o.id = d.local_id AND d.reltype != 'is_different')
|
||||||
LEFT OUTER JOIN organizations od ON (d.oa_original_id = od.id)
|
LEFT OUTER JOIN organizations od ON (d.oa_original_id = od.id)
|
||||||
LEFT OUTER JOIN other_ids idup ON (od.id = idup.id)
|
LEFT OUTER JOIN other_ids idup ON (od.id = idup.id)
|
||||||
|
LEFT OUTER JOIN org_types ON (org_types.val = o.type)
|
||||||
WHERE
|
WHERE
|
||||||
o.status = 'approved' OR o.status = 'suggested'
|
o.status = 'approved' OR o.status = 'suggested'
|
||||||
GROUP BY
|
GROUP BY
|
||||||
|
@ -44,4 +46,5 @@ GROUP BY
|
||||||
o.name,
|
o.name,
|
||||||
o.creation_date,
|
o.creation_date,
|
||||||
o.modification_date,
|
o.modification_date,
|
||||||
o.country;
|
o.country,
|
||||||
|
org_types.name;
|
|
@ -0,0 +1,5 @@
|
||||||
|
[
|
||||||
|
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": false},
|
||||||
|
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
|
||||||
|
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the scholix dump", "paramRequired": true}
|
||||||
|
]
|
|
@ -0,0 +1,166 @@
|
||||||
|
{
|
||||||
|
"cites":{
|
||||||
|
"original":"Cites",
|
||||||
|
"inverse":"IsCitedBy"
|
||||||
|
},
|
||||||
|
"compiles":{
|
||||||
|
"original":"Compiles",
|
||||||
|
"inverse":"IsCompiledBy"
|
||||||
|
},
|
||||||
|
"continues":{
|
||||||
|
"original":"Continues",
|
||||||
|
"inverse":"IsContinuedBy"
|
||||||
|
},
|
||||||
|
"derives":{
|
||||||
|
"original":"IsSourceOf",
|
||||||
|
"inverse":"IsDerivedFrom"
|
||||||
|
},
|
||||||
|
"describes":{
|
||||||
|
"original":"Describes",
|
||||||
|
"inverse":"IsDescribedBy"
|
||||||
|
},
|
||||||
|
"documents":{
|
||||||
|
"original":"Documents",
|
||||||
|
"inverse":"IsDocumentedBy"
|
||||||
|
},
|
||||||
|
"hasmetadata":{
|
||||||
|
"original":"HasMetadata",
|
||||||
|
"inverse":"IsMetadataOf"
|
||||||
|
},
|
||||||
|
"hasassociationwith":{
|
||||||
|
"original":"HasAssociationWith",
|
||||||
|
"inverse":"HasAssociationWith"
|
||||||
|
},
|
||||||
|
"haspart":{
|
||||||
|
"original":"HasPart",
|
||||||
|
"inverse":"IsPartOf"
|
||||||
|
},
|
||||||
|
"hasversion":{
|
||||||
|
"original":"HasVersion",
|
||||||
|
"inverse":"IsVersionOf"
|
||||||
|
},
|
||||||
|
"iscitedby":{
|
||||||
|
"original":"IsCitedBy",
|
||||||
|
"inverse":"Cites"
|
||||||
|
},
|
||||||
|
"iscompiledby":{
|
||||||
|
"original":"IsCompiledBy",
|
||||||
|
"inverse":"Compiles"
|
||||||
|
},
|
||||||
|
"iscontinuedby":{
|
||||||
|
"original":"IsContinuedBy",
|
||||||
|
"inverse":"Continues"
|
||||||
|
},
|
||||||
|
"isderivedfrom":{
|
||||||
|
"original":"IsDerivedFrom",
|
||||||
|
"inverse":"IsSourceOf"
|
||||||
|
},
|
||||||
|
"isdescribedby":{
|
||||||
|
"original":"IsDescribedBy",
|
||||||
|
"inverse":"Describes"
|
||||||
|
},
|
||||||
|
"isdocumentedby":{
|
||||||
|
"original":"IsDocumentedBy",
|
||||||
|
"inverse":"Documents"
|
||||||
|
},
|
||||||
|
"isidenticalto":{
|
||||||
|
"original":"IsIdenticalTo",
|
||||||
|
"inverse":"IsIdenticalTo"
|
||||||
|
},
|
||||||
|
"ismetadatafor":{
|
||||||
|
"original":"IsMetadataFor",
|
||||||
|
"inverse":"IsMetadataOf"
|
||||||
|
},
|
||||||
|
"ismetadataof":{
|
||||||
|
"original":"IsMetadataOf",
|
||||||
|
"inverse":"IsMetadataFor"
|
||||||
|
},
|
||||||
|
"isnewversionof":{
|
||||||
|
"original":"IsNewVersionOf",
|
||||||
|
"inverse":"IsPreviousVersionOf"
|
||||||
|
},
|
||||||
|
"isobsoletedby":{
|
||||||
|
"original":"IsObsoletedBy",
|
||||||
|
"inverse":"Obsoletes"
|
||||||
|
},
|
||||||
|
"isoriginalformof":{
|
||||||
|
"original":"IsOriginalFormOf",
|
||||||
|
"inverse":"IsVariantFormOf"
|
||||||
|
},
|
||||||
|
"ispartof":{
|
||||||
|
"original":"IsPartOf",
|
||||||
|
"inverse":"HasPart"
|
||||||
|
},
|
||||||
|
"ispreviousversionof":{
|
||||||
|
"original":"IsPreviousVersionOf",
|
||||||
|
"inverse":"IsNewVersionOf"
|
||||||
|
},
|
||||||
|
"isreferencedby":{
|
||||||
|
"original":"IsReferencedBy",
|
||||||
|
"inverse":"References"
|
||||||
|
},
|
||||||
|
"isrelatedto":{
|
||||||
|
"original":"IsRelatedTo",
|
||||||
|
"inverse":"IsRelatedTo"
|
||||||
|
},
|
||||||
|
"isrequiredby":{
|
||||||
|
"original":"IsRequiredBy",
|
||||||
|
"inverse":"Requires"
|
||||||
|
},
|
||||||
|
"isreviewedby":{
|
||||||
|
"original":"IsReviewedBy",
|
||||||
|
"inverse":"Reviews"
|
||||||
|
},
|
||||||
|
"issourceof":{
|
||||||
|
"original":"IsSourceOf",
|
||||||
|
"inverse":"IsDerivedFrom"
|
||||||
|
},
|
||||||
|
"issupplementedby":{
|
||||||
|
"original":"IsSupplementedBy",
|
||||||
|
"inverse":"IsSupplementTo"
|
||||||
|
},
|
||||||
|
"issupplementto":{
|
||||||
|
"original":"IsSupplementTo",
|
||||||
|
"inverse":"IsSupplementedBy"
|
||||||
|
},
|
||||||
|
"isvariantformof":{
|
||||||
|
"original":"IsVariantFormOf",
|
||||||
|
"inverse":"IsOriginalFormOf"
|
||||||
|
},
|
||||||
|
"isversionof":{
|
||||||
|
"original":"IsVersionOf",
|
||||||
|
"inverse":"HasVersion"
|
||||||
|
},
|
||||||
|
"obsoletes":{
|
||||||
|
"original":"Obsoletes",
|
||||||
|
"inverse":"IsObsoletedBy"
|
||||||
|
},
|
||||||
|
"references":{
|
||||||
|
"original":"References",
|
||||||
|
"inverse":"IsReferencedBy"
|
||||||
|
},
|
||||||
|
"requires":{
|
||||||
|
"original":"Requires",
|
||||||
|
"inverse":"IsRequiredBy"
|
||||||
|
},
|
||||||
|
"related":{
|
||||||
|
"original":"IsRelatedTo",
|
||||||
|
"inverse":"IsRelatedTo"
|
||||||
|
},
|
||||||
|
"reviews":{
|
||||||
|
"original":"Reviews",
|
||||||
|
"inverse":"IsReviewedBy"
|
||||||
|
},
|
||||||
|
"unknown":{
|
||||||
|
"original":"Unknown",
|
||||||
|
"inverse":"Unknown"
|
||||||
|
},
|
||||||
|
"isamongtopnsimilardocuments": {
|
||||||
|
"original": "IsAmongTopNSimilarDocuments",
|
||||||
|
"inverse": "HasAmongTopNSimilarDocuments"
|
||||||
|
},
|
||||||
|
"hasamongtopnsimilardocuments": {
|
||||||
|
"original": "HasAmongTopNSimilarDocuments",
|
||||||
|
"inverse": "IsAmongTopNSimilarDocuments"
|
||||||
|
}
|
||||||
|
}
|
|
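The vocabulary above pairs each relation class with its inverse; it is loaded from /eu/dnetlib/dhp/sx/relation/relations.json by ScholexplorerUtils.invRel, introduced later in this diff. A short usage sketch, assuming the resource is on the classpath:

import eu.dnetlib.dhp.sx.graph.ScholexplorerUtils

// the lookup is keyed on the lowercased relation class
ScholexplorerUtils.invRel("Cites")             // "IsCitedBy"
ScholexplorerUtils.invRel("IsSupplementedBy")  // "IsSupplementTo"
ScholexplorerUtils.invRel("not-in-vocabulary") // null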
@ -25,6 +25,22 @@ object SparkApplyHostedByMapToResult {
|
||||||
val i = p.getInstance().asScala
|
val i = p.getInstance().asScala
|
||||||
if (i.size == 1) {
|
if (i.size == 1) {
|
||||||
val inst: Instance = i.head
|
val inst: Instance = i.head
|
||||||
|
patchInstance(p, ei, inst)
|
||||||
|
|
||||||
|
} else {
|
||||||
|
val cf = i.map(ii => ii.getCollectedfrom.getValue)
|
||||||
|
if (cf.contains("Crossref")) {
|
||||||
|
i.foreach(ii => {
|
||||||
|
patchInstance(p, ei, ii)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
p
|
||||||
|
})(Encoders.bean(classOf[Publication]))
|
||||||
|
}
|
||||||
|
|
||||||
|
private def patchInstance(p: Publication, ei: EntityInfo, inst: Instance): Unit = {
|
||||||
inst.getHostedby.setKey(ei.getHostedById)
|
inst.getHostedby.setKey(ei.getHostedById)
|
||||||
inst.getHostedby.setValue(ei.getName)
|
inst.getHostedby.setValue(ei.getName)
|
||||||
if (ei.getOpenAccess) {
|
if (ei.getOpenAccess) {
|
||||||
|
@ -39,11 +55,6 @@ object SparkApplyHostedByMapToResult {
|
||||||
inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
inst.getAccessright.setOpenAccessRoute(OpenAccessRoute.gold)
|
||||||
p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
|
p.setBestaccessright(OafMapperUtils.createBestAccessRights(p.getInstance()));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
p
|
|
||||||
})(Encoders.bean(classOf[Publication]))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
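The change above widens the hostedby patching: a publication with a single instance is always patched, while one with several instances has all of them patched only when at least one was collected from Crossref. A compact sketch of that rule, reusing the accessors shown in the hunk (the dhp-schema classes are assumed available):

import scala.collection.JavaConverters._
import eu.dnetlib.dhp.schema.oaf.{Instance, Publication}

// Which instances of `p` would be patched under the rule introduced above?
def instancesToPatch(p: Publication): Seq[Instance] = {
  val instances = p.getInstance().asScala
  if (instances.size == 1) instances
  else if (instances.exists(_.getCollectedfrom.getValue == "Crossref")) instances
  else Seq.empty
}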
|
@ -0,0 +1,258 @@
|
||||||
|
package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{KeyValue, Result, StructuredProperty}
|
||||||
|
import eu.dnetlib.dhp.schema.sx.scholix.{
|
||||||
|
Scholix,
|
||||||
|
ScholixCollectedFrom,
|
||||||
|
ScholixEntityId,
|
||||||
|
ScholixIdentifier,
|
||||||
|
ScholixRelationship,
|
||||||
|
ScholixResource
|
||||||
|
}
|
||||||
|
import org.json4s
|
||||||
|
import org.json4s.DefaultFormats
|
||||||
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
|
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.io.Source
|
||||||
|
|
||||||
|
case class RelationInfo(
|
||||||
|
source: String,
|
||||||
|
target: String,
|
||||||
|
relclass: String,
|
||||||
|
id: String,
|
||||||
|
collectedfrom: Seq[RelKeyValue]
|
||||||
|
) {}
|
||||||
|
case class RelKeyValue(key: String, value: String) {}
|
||||||
|
|
||||||
|
object ScholexplorerUtils {
|
||||||
|
|
||||||
|
val OPENAIRE_IDENTIFIER_SCHEMA: String = "OpenAIRE Identifier"
|
||||||
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
|
case class RelationVocabulary(original: String, inverse: String) {}
|
||||||
|
|
||||||
|
val relations: Map[String, RelationVocabulary] = {
|
||||||
|
val input = Source
|
||||||
|
.fromInputStream(
|
||||||
|
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/relation/relations.json")
|
||||||
|
)
|
||||||
|
.mkString
|
||||||
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
|
|
||||||
|
lazy val json: json4s.JValue = parse(input)
|
||||||
|
|
||||||
|
json.extract[Map[String, RelationVocabulary]]
|
||||||
|
}
|
||||||
|
|
||||||
|
def invRel(rel: String): String = {
|
||||||
|
val semanticRelation = relations.getOrElse(rel.toLowerCase, null)
|
||||||
|
if (semanticRelation != null)
|
||||||
|
semanticRelation.inverse
|
||||||
|
else
|
||||||
|
null
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateDatasourceOpenAIREURLS(id: String): String = {
|
||||||
|
if (id != null && id.length > 12)
|
||||||
|
s"https://explore.openaire.eu/search/dataprovider?datasourceId=${id.substring(3)}"
|
||||||
|
else
|
||||||
|
null
|
||||||
|
}
|
||||||
|
|
||||||
|
def findURLForPID(
|
||||||
|
pidValue: List[StructuredProperty],
|
||||||
|
urls: List[String]
|
||||||
|
): List[(StructuredProperty, String)] = {
|
||||||
|
pidValue.map { p =>
|
||||||
|
val pv = p.getValue
|
||||||
|
|
||||||
|
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
|
||||||
|
(p, r.orNull)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
|
||||||
|
if (r.getInstance() == null || r.getInstance().isEmpty)
|
||||||
|
return List()
|
||||||
|
r.getInstance()
|
||||||
|
.asScala
|
||||||
|
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
|
||||||
|
.filter(i => i.getPid != null && i.getUrl != null)
|
||||||
|
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
|
||||||
|
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
|
||||||
|
.distinct
|
||||||
|
.toList
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateScholixResourceFromResult(result: Result): ScholixResource = {
|
||||||
|
|
||||||
|
if (result.getInstance() == null || result.getInstance().size() == 0)
|
||||||
|
return null
|
||||||
|
|
||||||
|
if (result.getPid == null || result.getPid.isEmpty)
|
||||||
|
return null
|
||||||
|
|
||||||
|
val r = new ScholixResource
|
||||||
|
r.setDnetIdentifier(result.getId)
|
||||||
|
|
||||||
|
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(result)
|
||||||
|
if (persistentIdentifiers.isEmpty)
|
||||||
|
return null
|
||||||
|
|
||||||
|
r.setIdentifier(persistentIdentifiers.asJava)
|
||||||
|
|
||||||
|
r.setObjectType(result.getResulttype.getClassid)
|
||||||
|
|
||||||
|
r.setObjectSubType(
|
||||||
|
result
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.filter(i => i != null && i.getInstancetype != null)
|
||||||
|
.map(i => i.getInstancetype.getClassname)
|
||||||
|
.distinct
|
||||||
|
.head
|
||||||
|
)
|
||||||
|
|
||||||
|
if (result.getTitle != null && result.getTitle.asScala.nonEmpty) {
|
||||||
|
val titles: List[String] = result.getTitle.asScala.map(t => t.getValue).toList
|
||||||
|
if (titles.nonEmpty)
|
||||||
|
r.setTitle(titles.head)
|
||||||
|
else
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
if (result.getAuthor != null && !result.getAuthor.isEmpty) {
|
||||||
|
val authors: List[ScholixEntityId] =
|
||||||
|
result.getAuthor.asScala
|
||||||
|
.map(a => {
|
||||||
|
val entity = new ScholixEntityId()
|
||||||
|
entity.setName(a.getFullname)
|
||||||
|
if (a.getPid != null && a.getPid.size() > 0)
|
||||||
|
entity.setIdentifiers(
|
||||||
|
a.getPid.asScala
|
||||||
|
.map(sp => {
|
||||||
|
val id = new ScholixIdentifier()
|
||||||
|
id.setIdentifier(sp.getValue)
|
||||||
|
id.setSchema(sp.getQualifier.getClassid)
|
||||||
|
id
|
||||||
|
})
|
||||||
|
.take(3)
|
||||||
|
.toList
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
entity
|
||||||
|
})
|
||||||
|
.toList
|
||||||
|
if (authors.nonEmpty)
|
||||||
|
r.setCreator(authors.asJava)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
val dt: List[String] = result
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.filter(i => i.getDateofacceptance != null)
|
||||||
|
.map(i => i.getDateofacceptance.getValue)
|
||||||
|
.toList
|
||||||
|
if (dt.nonEmpty)
|
||||||
|
r.setPublicationDate(dt.distinct.head)
|
||||||
|
|
||||||
|
r.setPublisher(
|
||||||
|
result
|
||||||
|
.getInstance()
|
||||||
|
.asScala
|
||||||
|
.map(i => i.getHostedby)
|
||||||
|
.filter(h => !"unknown".equalsIgnoreCase(h.getValue))
|
||||||
|
.map(h => {
|
||||||
|
val eid = new ScholixEntityId()
|
||||||
|
eid.setName(h.getValue)
|
||||||
|
val id = new ScholixIdentifier()
|
||||||
|
id.setIdentifier(h.getKey)
|
||||||
|
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||||
|
id.setUrl(generateDatasourceOpenAIREURLS(h.getKey))
|
||||||
|
eid.setIdentifiers(List(id).asJava)
|
||||||
|
eid
|
||||||
|
})
|
||||||
|
.distinct
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
|
||||||
|
r.setCollectedFrom(
|
||||||
|
result.getCollectedfrom.asScala
|
||||||
|
.map(cf => {
|
||||||
|
val scf = new ScholixCollectedFrom()
|
||||||
|
scf.setProvisionMode("collected")
|
||||||
|
scf.setCompletionStatus("complete")
|
||||||
|
val eid = new ScholixEntityId()
|
||||||
|
eid.setName(cf.getValue)
|
||||||
|
val id = new ScholixIdentifier()
|
||||||
|
id.setIdentifier(cf.getKey)
|
||||||
|
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||||
|
id.setUrl(generateDatasourceOpenAIREURLS(cf.getKey))
|
||||||
|
eid.setIdentifiers(List(id).asJava)
|
||||||
|
scf.setProvider(eid)
|
||||||
|
scf
|
||||||
|
})
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
|
||||||
|
r
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateScholix(relation: RelationInfo, source: ScholixResource): Scholix = {
|
||||||
|
val s: Scholix = new Scholix
|
||||||
|
s.setSource(source)
|
||||||
|
if (relation.collectedfrom != null && relation.collectedfrom.nonEmpty)
|
||||||
|
s.setLinkprovider(
|
||||||
|
relation.collectedfrom
|
||||||
|
.map(cf => {
|
||||||
|
val eid = new ScholixEntityId()
|
||||||
|
eid.setName(cf.value)
|
||||||
|
val id = new ScholixIdentifier()
|
||||||
|
id.setIdentifier(cf.key)
|
||||||
|
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||||
|
id.setUrl(generateDatasourceOpenAIREURLS(cf.key))
|
||||||
|
eid.setIdentifiers(List(id).asJava)
|
||||||
|
eid
|
||||||
|
})
|
||||||
|
.toList
|
||||||
|
.asJava
|
||||||
|
)
|
||||||
|
else {
|
||||||
|
val eid = new ScholixEntityId()
|
||||||
|
eid.setName("OpenAIRE")
|
||||||
|
val id = new ScholixIdentifier()
|
||||||
|
id.setIdentifier("10|infrastruct_::f66f1bd369679b5b077dcdf006089556")
|
||||||
|
id.setSchema(OPENAIRE_IDENTIFIER_SCHEMA)
|
||||||
|
id.setUrl(generateDatasourceOpenAIREURLS(id.getIdentifier))
|
||||||
|
eid.setIdentifiers(List(id).asJava)
|
||||||
|
s.setLinkprovider(List(eid).asJava)
|
||||||
|
}
|
||||||
|
s.setIdentifier(relation.id)
|
||||||
|
val semanticRelation = relations.getOrElse(relation.relclass.toLowerCase, null)
|
||||||
|
if (semanticRelation == null)
|
||||||
|
return null
|
||||||
|
s.setRelationship(
|
||||||
|
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||||
|
)
|
||||||
|
s.setPublicationDate(source.getPublicationDate)
|
||||||
|
s.setPublisher(source.getPublisher)
|
||||||
|
val mockTarget = new ScholixResource
|
||||||
|
mockTarget.setDnetIdentifier(relation.target)
|
||||||
|
s.setTarget(mockTarget)
|
||||||
|
s
|
||||||
|
}
|
||||||
|
|
||||||
|
def updateTarget(s: Scholix, t: ScholixResource): String = {
|
||||||
|
|
||||||
|
s.setTarget(t)
|
||||||
|
val spublishers: Seq[ScholixEntityId] =
|
||||||
|
if (s.getPublisher != null && !s.getPublisher.isEmpty) s.getPublisher.asScala else List()
|
||||||
|
val tpublishers: Seq[ScholixEntityId] =
|
||||||
|
if (t.getPublisher != null && !t.getPublisher.isEmpty) t.getPublisher.asScala else List()
|
||||||
|
val mergedPublishers = spublishers.union(tpublishers).distinct.take(10).toList
|
||||||
|
s.setPublisher(mergedPublishers.asJava)
|
||||||
|
mapper.writeValueAsString(s)
|
||||||
|
}
|
||||||
|
}
|
|
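Two helpers above do most of the identifier plumbing: generateDatasourceOpenAIREURLS strips the "10|" datasource prefix and builds an Explore portal link (returning null for ids that are missing or too short), and findURLForPID pairs each instance PID with a matching URL. A small usage sketch, using the identifier that already appears in this file:

import eu.dnetlib.dhp.sx.graph.ScholexplorerUtils

ScholexplorerUtils.generateDatasourceOpenAIREURLS(
  "10|infrastruct_::f66f1bd369679b5b077dcdf006089556"
)
// -> https://explore.openaire.eu/search/dataprovider?datasourceId=infrastruct_::f66f1bd369679b5b077dcdf006089556

ScholexplorerUtils.generateDatasourceOpenAIREURLS(null) // null, as for any id of 12 characters or fewer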
@ -0,0 +1,141 @@
|
||||||
|
package eu.dnetlib.dhp.sx.graph
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{
|
||||||
|
KeyValue,
|
||||||
|
OtherResearchProduct,
|
||||||
|
Publication,
|
||||||
|
Relation,
|
||||||
|
Result,
|
||||||
|
Software,
|
||||||
|
Dataset => OafDataset
|
||||||
|
}
|
||||||
|
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
|
||||||
|
import org.apache.spark.sql.functions.{col, concat, expr, first, md5}
|
||||||
|
import org.apache.spark.sql.types.StructType
|
||||||
|
import org.apache.spark.sql._
|
||||||
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
|
class SparkCreateScholexplorerDump(propertyPath: String, args: Array[String], log: Logger)
|
||||||
|
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||||
|
|
||||||
|
/** Here all the Spark applications run this method
|
||||||
|
* where the whole logic of the spark node is defined
|
||||||
|
*/
|
||||||
|
override def run(): Unit = {
|
||||||
|
val sourcePath = parser.get("sourcePath")
|
||||||
|
log.info("sourcePath: {}", sourcePath)
|
||||||
|
val targetPath = parser.get("targetPath")
|
||||||
|
log.info("targetPath: {}", targetPath)
|
||||||
|
generateBidirectionalRelations(sourcePath, targetPath, spark)
|
||||||
|
generateScholixResource(sourcePath, targetPath, spark)
|
||||||
|
generateScholix(targetPath, spark)
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateScholixResource(inputPath: String, outputPath: String, spark: SparkSession): Unit = {
|
||||||
|
val entityMap: Map[String, StructType] = Map(
|
||||||
|
"publication" -> Encoders.bean(classOf[Publication]).schema,
|
||||||
|
"dataset" -> Encoders.bean(classOf[OafDataset]).schema,
|
||||||
|
"software" -> Encoders.bean(classOf[Software]).schema,
|
||||||
|
"otherresearchproduct" -> Encoders.bean(classOf[OtherResearchProduct]).schema
|
||||||
|
)
|
||||||
|
|
||||||
|
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
|
||||||
|
implicit val resultEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
|
||||||
|
|
||||||
|
val resDs = spark.emptyDataset[ScholixResource]
|
||||||
|
val scholixResourceDS = entityMap.foldLeft[Dataset[ScholixResource]](resDs)((res, item) => {
|
||||||
|
println(s"adding ${item._1}")
|
||||||
|
res.union(
|
||||||
|
spark.read
|
||||||
|
.schema(item._2)
|
||||||
|
.json(s"$inputPath/${item._1}")
|
||||||
|
.as[Result]
|
||||||
|
.map(r => ScholexplorerUtils.generateScholixResourceFromResult(r))
|
||||||
|
.filter(s => s != null)
|
||||||
|
)
|
||||||
|
})
|
||||||
|
scholixResourceDS.write.mode(SaveMode.Overwrite).save(s"$outputPath/resource")
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateBidirectionalRelations(inputPath: String, otuputPath: String, spark: SparkSession): Unit = {
|
||||||
|
val relSchema = Encoders.bean(classOf[Relation]).schema
|
||||||
|
|
||||||
|
val relDF = spark.read
|
||||||
|
.schema(relSchema)
|
||||||
|
.json(s"$inputPath/relation")
|
||||||
|
.where(
|
||||||
|
"datainfo.deletedbyinference is false and source like '50%' and target like '50%' " +
|
||||||
|
"and relClass <> 'merges' and relClass <> 'isMergedIn'"
|
||||||
|
)
|
||||||
|
.select("source", "target", "collectedfrom", "relClass")
|
||||||
|
|
||||||
|
def invRel: String => String = { s =>
|
||||||
|
ScholexplorerUtils.invRel(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
import org.apache.spark.sql.functions.udf
|
||||||
|
val inverseRelationUDF = udf(invRel)
|
||||||
|
val inverseRelation = relDF.select(
|
||||||
|
col("target").alias("source"),
|
||||||
|
col("source").alias("target"),
|
||||||
|
col("collectedfrom"),
|
||||||
|
inverseRelationUDF(col("relClass")).alias("relClass")
|
||||||
|
)
|
||||||
|
|
||||||
|
val bidRel = inverseRelation
|
||||||
|
.union(relDF)
|
||||||
|
.withColumn("id", md5(concat(col("source"), col("relClass"), col("target"))))
|
||||||
|
.withColumn("cf", expr("transform(collectedfrom, x -> struct(x.key, x.value))"))
|
||||||
|
.drop("collectedfrom")
|
||||||
|
.withColumnRenamed("cf", "collectedfrom")
|
||||||
|
.groupBy(col("id"))
|
||||||
|
.agg(
|
||||||
|
first("source").alias("source"),
|
||||||
|
first("target").alias("target"),
|
||||||
|
first("relClass").alias("relClass"),
|
||||||
|
first("collectedfrom").alias("collectedfrom")
|
||||||
|
)
|
||||||
|
|
||||||
|
bidRel.write.mode(SaveMode.Overwrite).save(s"$otuputPath/relation")
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def generateScholix(outputPath: String, spark: SparkSession): Unit = {
|
||||||
|
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.bean(classOf[ScholixResource])
|
||||||
|
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo(classOf[Scholix])
|
||||||
|
|
||||||
|
import spark.implicits._
|
||||||
|
val relations = spark.read.load(s"$outputPath/relation").as[RelationInfo]
|
||||||
|
val resource = spark.read.load(s"$outputPath/resource").as[ScholixResource]
|
||||||
|
|
||||||
|
val scholix_one_verse = relations
|
||||||
|
.joinWith(resource, relations("source") === resource("dnetIdentifier"), "inner")
|
||||||
|
.map(res => ScholexplorerUtils.generateScholix(res._1, res._2))
|
||||||
|
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[Scholix])))
|
||||||
|
|
||||||
|
val resourceTarget = relations
|
||||||
|
.joinWith(resource, relations("target") === resource("dnetIdentifier"), "inner")
|
||||||
|
.map(res => (res._1.id, res._2))(Encoders.tuple(Encoders.STRING, Encoders.kryo(classOf[ScholixResource])))
|
||||||
|
|
||||||
|
scholix_one_verse
|
||||||
|
.joinWith(resourceTarget, scholix_one_verse("_1") === resourceTarget("_1"), "inner")
|
||||||
|
.map(k => ScholexplorerUtils.updateTarget(k._1._2, k._2._2))
|
||||||
|
.write
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.text(s"$outputPath/scholix")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
object SparkCreateScholexplorerDump {
|
||||||
|
val logger: Logger = LoggerFactory.getLogger(SparkCreateScholexplorerDump.getClass)
|
||||||
|
|
||||||
|
def main(args: Array[String]): Unit = {
|
||||||
|
new SparkCreateScholexplorerDump(
|
||||||
|
log = logger,
|
||||||
|
args = args,
|
||||||
|
propertyPath = "/eu/dnetlib/dhp/sx/create_scholix_dump_params.json"
|
||||||
|
).initialize().run()
|
||||||
|
}
|
||||||
|
}
|
|
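run() chains three steps: bidirectional relations, Scholix resources, and the final join that writes the dump. The output layout under targetPath follows the write calls above (Parquet for relation and resource, gzipped JSON text for scholix); a sketch of reading it back, with placeholder paths:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// "/data/scholix" stands in for ${targetPath}
val relations = spark.read.load("/data/scholix/relation") // source, target, relClass, collectedfrom, id
val resources = spark.read.load("/data/scholix/resource") // ScholixResource records
val scholix   = spark.read.text("/data/scholix/scholix")  // one gzipped JSON Scholix link per line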
@ -0,0 +1,26 @@
|
||||||
|
package eu.dnetlib.dhp.sx.graph.scholix
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
|
||||||
|
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
|
||||||
|
import org.apache.spark.SparkConf
|
||||||
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
|
import org.junit.jupiter.api.Test
|
||||||
|
import org.objenesis.strategy.StdInstantiatorStrategy
|
||||||
|
|
||||||
|
class ScholixGenerationTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
def generateScholix(): Unit = {
|
||||||
|
|
||||||
|
val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
|
||||||
|
val app = new SparkCreateScholexplorerDump(null, null, null)
|
||||||
|
// app.generateScholixResource("/home/sandro/Downloads/scholix_sample/", "/home/sandro/Downloads/scholix/", spark)
|
||||||
|
// app.generateBidirectionalRelations(
|
||||||
|
// "/home/sandro/Downloads/scholix_sample/",
|
||||||
|
// "/home/sandro/Downloads/scholix/",
|
||||||
|
// spark
|
||||||
|
// )
|
||||||
|
app.generateScholix("/home/sandro/Downloads/scholix/", spark)
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -18,7 +18,7 @@
|
||||||
<executions>
|
<executions>
|
||||||
<execution>
|
<execution>
|
||||||
<id>scala-compile-first</id>
|
<id>scala-compile-first</id>
|
||||||
<phase>initialize</phase>
|
<phase>process-resources</phase>
|
||||||
<goals>
|
<goals>
|
||||||
<goal>add-source</goal>
|
<goal>add-source</goal>
|
||||||
<goal>compile</goal>
|
<goal>compile</goal>
|
||||||
|
@ -59,12 +59,6 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
<artifactId>json-path</artifactId>
|
<artifactId>json-path</artifactId>
|
||||||
<exclusions>
|
|
||||||
<exclusion>
|
|
||||||
<groupId>org.slf4j</groupId>
|
|
||||||
<artifactId>slf4j-api</artifactId>
|
|
||||||
</exclusion>
|
|
||||||
</exclusions>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>dom4j</groupId>
|
<groupId>dom4j</groupId>
|
||||||
|
@ -160,6 +154,26 @@
|
||||||
<groupId>org.apache.zookeeper</groupId>
|
<groupId>org.apache.zookeeper</groupId>
|
||||||
<artifactId>zookeeper</artifactId>
|
<artifactId>zookeeper</artifactId>
|
||||||
</exclusion>
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<artifactId>ant</artifactId>
|
||||||
|
<groupId>org.apache.ant</groupId>
|
||||||
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<artifactId>antlr4-runtime</artifactId>
|
||||||
|
<groupId>org.antlr</groupId>
|
||||||
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<artifactId>woodstox-core</artifactId>
|
||||||
|
<groupId>com.fasterxml.woodstox</groupId>
|
||||||
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<artifactId>log4j</artifactId>
|
||||||
|
<groupId>*</groupId>
|
||||||
|
</exclusion>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>org.apache.logging.log4j</groupId>
|
||||||
|
<artifactId>*</artifactId>
|
||||||
|
</exclusion>
|
||||||
</exclusions>
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -206,5 +220,90 @@
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
<profiles>
|
||||||
|
<profile>
|
||||||
|
<id>spark-24</id>
|
||||||
|
<activation>
|
||||||
|
<activeByDefault>true</activeByDefault>
|
||||||
|
</activation>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>build-helper-maven-plugin</artifactId>
|
||||||
|
<version>3.4.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<sources>
|
||||||
|
<source>src/main/sparksolr-3</source>
|
||||||
|
</sources>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>spark-34</id>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>build-helper-maven-plugin</artifactId>
|
||||||
|
<version>3.4.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<sources>
|
||||||
|
<source>src/main/sparksolr-4</source>
|
||||||
|
</sources>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
|
|
||||||
|
<profile>
|
||||||
|
<id>spark-35</id>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>build-helper-maven-plugin</artifactId>
|
||||||
|
<version>3.4.0</version>
|
||||||
|
<executions>
|
||||||
|
<execution>
|
||||||
|
<phase>generate-sources</phase>
|
||||||
|
<goals>
|
||||||
|
<goal>add-source</goal>
|
||||||
|
</goals>
|
||||||
|
<configuration>
|
||||||
|
<sources>
|
||||||
|
<source>src/main/sparksolr-4</source>
|
||||||
|
</sources>
|
||||||
|
</configuration>
|
||||||
|
</execution>
|
||||||
|
</executions>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
</profile>
|
||||||
|
</profiles>
|
||||||
|
|
||||||
</project>
|
</project>
|
|
@ -31,7 +31,6 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.provision.XmlConverterJob;
|
|
||||||
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
|
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
|
import eu.dnetlib.dhp.oa.provision.model.TupleWrapper;
|
||||||
|
|
||||||
|
@ -48,7 +47,7 @@ public class IrishOaiExporterJob {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
XmlConverterJob.class
|
IrishOaiExporterJob.class
|
||||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json")));
|
.getResourceAsStream("/eu/dnetlib/dhp/oa/oaipmh/input_params_irish_oai_exporter.json")));
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
|
|
@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
|
||||||
result
|
result
|
||||||
.getTitle()
|
.getTitle()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(t -> StringUtils.isNotBlank(t.getValue()))
|
||||||
.findFirst()
|
.findFirst()
|
||||||
.map(StructuredProperty::getValue)
|
|
||||||
.ifPresent(
|
.ifPresent(
|
||||||
title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
|
title -> {
|
||||||
|
re.setTitle(title);
|
||||||
|
re
|
||||||
|
.getTitle()
|
||||||
|
.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||||
|
});
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
|
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
|
||||||
result
|
result
|
||||||
|
|
|
@ -3,24 +3,16 @@ package eu.dnetlib.dhp.oa.provision;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
|
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
|
||||||
import static org.apache.spark.sql.functions.*;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
|
||||||
import org.apache.hadoop.io.compress.GzipCodec;
|
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.SparkContext;
|
import org.apache.spark.SparkContext;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.expressions.UserDefinedFunction;
|
|
||||||
import org.apache.spark.sql.types.DataTypes;
|
|
||||||
import org.apache.spark.util.LongAccumulator;
|
import org.apache.spark.util.LongAccumulator;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -45,9 +37,9 @@ import scala.Tuple2;
|
||||||
/**
|
/**
|
||||||
* XmlConverterJob converts the JoinedEntities as XML records
|
* XmlConverterJob converts the JoinedEntities as XML records
|
||||||
*/
|
*/
|
||||||
public class XmlConverterJob {
|
public class PayloadConverterJob {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
|
private static final Logger log = LoggerFactory.getLogger(PayloadConverterJob.class);
|
||||||
|
|
||||||
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
||||||
|
|
||||||
|
@ -56,8 +48,8 @@ public class XmlConverterJob {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
XmlConverterJob.class
|
PayloadConverterJob.class
|
||||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
|
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json")));
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
final Boolean isSparkSessionManaged = Optional
|
final Boolean isSparkSessionManaged = Optional
|
||||||
|
@ -72,6 +64,12 @@ public class XmlConverterJob {
|
||||||
final String outputPath = parser.get("outputPath");
|
final String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath: {}", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
|
final Boolean validateXML = Optional
|
||||||
|
.ofNullable(parser.get("validateXML"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.FALSE);
|
||||||
|
log.info("validateXML: {}", validateXML);
|
||||||
|
|
||||||
final String contextApiBaseUrl = parser.get("contextApiBaseUrl");
|
final String contextApiBaseUrl = parser.get("contextApiBaseUrl");
|
||||||
log.info("contextApiBaseUrl: {}", contextApiBaseUrl);
|
log.info("contextApiBaseUrl: {}", contextApiBaseUrl);
|
||||||
|
|
||||||
|
@ -86,18 +84,19 @@ public class XmlConverterJob {
|
||||||
|
|
||||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||||
removeOutputDir(spark, outputPath);
|
removeOutputDir(spark, outputPath);
|
||||||
convertToXml(
|
createPayloads(
|
||||||
spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl),
|
spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl),
|
||||||
VocabularyGroup.loadVocsFromIS(isLookup));
|
VocabularyGroup.loadVocsFromIS(isLookup), validateXML);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void convertToXml(
|
private static void createPayloads(
|
||||||
final SparkSession spark,
|
final SparkSession spark,
|
||||||
final String inputPath,
|
final String inputPath,
|
||||||
final String outputPath,
|
final String outputPath,
|
||||||
final ContextMapper contextMapper,
|
final ContextMapper contextMapper,
|
||||||
final VocabularyGroup vocabularies) {
|
final VocabularyGroup vocabularies,
|
||||||
|
final Boolean validateXML) {
|
||||||
|
|
||||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(
|
final XmlRecordFactory recordFactory = new XmlRecordFactory(
|
||||||
prepareAccumulators(spark.sparkContext()),
|
prepareAccumulators(spark.sparkContext()),
|
||||||
|
@ -118,7 +117,7 @@ public class XmlConverterJob {
|
||||||
.as(Encoders.kryo(JoinedEntity.class))
|
.as(Encoders.kryo(JoinedEntity.class))
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
|
(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
|
||||||
recordFactory.build(je),
|
recordFactory.build(je, validateXML),
|
||||||
ProvisionModelSupport.transform(je, contextMapper, vocabularies)),
|
ProvisionModelSupport.transform(je, contextMapper, vocabularies)),
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class)))
|
Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class)))
|
||||||
.map(
|
.map(
|
|
@ -2,42 +2,34 @@
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
package eu.dnetlib.dhp.oa.provision;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
import static org.apache.spark.sql.functions.col;
|
||||||
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.PriorityQueue;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
|
||||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
|
||||||
import org.apache.spark.api.java.function.Function;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Encoder;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.apache.spark.sql.expressions.Aggregator;
|
import org.apache.spark.sql.expressions.Window;
|
||||||
|
import org.apache.spark.sql.expressions.WindowSpec;
|
||||||
|
import org.apache.spark.sql.functions;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
|
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
|
|
||||||
import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
|
* PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
|
||||||
|
@ -130,132 +122,36 @@ public class PrepareRelationsJob {
|
||||||
private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath,
|
private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath,
|
||||||
Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) {
|
Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) {
|
||||||
|
|
||||||
JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
|
WindowSpec source_w = Window
|
||||||
.filter(rel -> !(rel.getSource().startsWith("unresolved") || rel.getTarget().startsWith("unresolved")))
|
.partitionBy("source", "subRelType")
|
||||||
.filter(rel -> !rel.getDataInfo().getDeletedbyinference())
|
.orderBy(col("target").desc_nulls_last());
|
||||||
.filter(rel -> !relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())));
|
|
||||||
|
|
||||||
JavaRDD<Relation> pruned = pruneRels(
|
WindowSpec target_w = Window
|
||||||
pruneRels(
|
.partitionBy("target", "subRelType")
|
||||||
rels,
|
.orderBy(col("source").desc_nulls_last());
|
||||||
sourceMaxRelations, relPartitions, (Function<Relation, String>) Relation::getSource),
|
|
||||||
targetMaxRelations, relPartitions, (Function<Relation, String>) Relation::getTarget);
|
|
||||||
spark
|
|
||||||
.createDataset(pruned.rdd(), Encoders.bean(Relation.class))
|
|
||||||
.repartition(relPartitions)
|
|
||||||
.write()
|
|
||||||
.mode(SaveMode.Overwrite)
|
|
||||||
.parquet(outputPath);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static JavaRDD<Relation> pruneRels(JavaRDD<Relation> rels, int maxRelations,
|
|
||||||
int relPartitions, Function<Relation, String> idFn) {
|
|
||||||
return rels
|
|
||||||
.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r))
|
|
||||||
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
|
|
||||||
.groupBy(Tuple2::_1)
|
|
||||||
.map(Tuple2::_2)
|
|
||||||
.map(t -> Iterables.limit(t, maxRelations))
|
|
||||||
.flatMap(Iterable::iterator)
|
|
||||||
.map(Tuple2::_2);
|
|
||||||
}
|
|
||||||
|
|
||||||
// experimental
|
|
||||||
private static void prepareRelationsDataset(
|
|
||||||
SparkSession spark, String inputRelationsPath, String outputPath, Set<String> relationFilter, int maxRelations,
|
|
||||||
int relPartitions) {
|
|
||||||
spark
|
spark
|
||||||
.read()
|
.read()
|
||||||
.textFile(inputRelationsPath)
|
.schema(Encoders.bean(Relation.class).schema())
|
||||||
.repartition(relPartitions)
|
.json(inputRelationsPath)
|
||||||
.map(
|
.where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'")
|
||||||
(MapFunction<String, Relation>) s -> OBJECT_MAPPER.readValue(s, Relation.class),
|
.where("datainfo.deletedbyinference != true")
|
||||||
Encoders.kryo(Relation.class))
|
.where(
|
||||||
.filter((FilterFunction<Relation>) rel -> !rel.getDataInfo().getDeletedbyinference())
|
relationFilter.isEmpty() ? ""
|
||||||
.filter((FilterFunction<Relation>) rel -> !relationFilter.contains(rel.getRelClass()))
|
: "lower(relClass) NOT IN ("
|
||||||
.groupByKey(
|
+ relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")")
|
||||||
(MapFunction<Relation, String>) Relation::getSource,
|
.withColumn("source_w_pos", functions.row_number().over(source_w))
|
||||||
Encoders.STRING())
|
.where("source_w_pos < " + sourceMaxRelations)
|
||||||
.agg(new RelationAggregator(maxRelations).toColumn())
|
.drop("source_w_pos")
|
||||||
.flatMap(
|
.withColumn("target_w_pos", functions.row_number().over(target_w))
|
||||||
(FlatMapFunction<Tuple2<String, RelationList>, Relation>) t -> Iterables
|
.where("target_w_pos < " + targetMaxRelations)
|
||||||
.limit(t._2().getRelations(), maxRelations)
|
.drop("target_w_pos")
|
||||||
.iterator(),
|
.coalesce(relPartitions)
|
||||||
Encoders.bean(Relation.class))
|
|
||||||
.repartition(relPartitions)
|
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.parquet(outputPath);
|
.parquet(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class RelationAggregator
|
|
||||||
extends Aggregator<Relation, RelationList, RelationList> {
|
|
||||||
|
|
||||||
private final int maxRelations;
|
|
||||||
|
|
||||||
public RelationAggregator(int maxRelations) {
|
|
||||||
this.maxRelations = maxRelations;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public RelationList zero() {
|
|
||||||
return new RelationList();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public RelationList reduce(RelationList b, Relation a) {
|
|
||||||
b.getRelations().add(a);
|
|
||||||
return getSortableRelationList(b);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public RelationList merge(RelationList b1, RelationList b2) {
|
|
||||||
b1.getRelations().addAll(b2.getRelations());
|
|
||||||
return getSortableRelationList(b1);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public RelationList finish(RelationList r) {
|
|
||||||
return getSortableRelationList(r);
|
|
||||||
}
|
|
||||||
|
|
||||||
private RelationList getSortableRelationList(RelationList b1) {
|
|
||||||
RelationList sr = new RelationList();
|
|
||||||
sr
|
|
||||||
.setRelations(
|
|
||||||
b1
|
|
||||||
.getRelations()
|
|
||||||
.stream()
|
|
||||||
.limit(maxRelations)
|
|
||||||
.collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator()))));
|
|
||||||
return sr;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Encoder<RelationList> bufferEncoder() {
|
|
||||||
return Encoders.kryo(RelationList.class);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Encoder<RelationList> outputEncoder() {
|
|
||||||
return Encoders.kryo(RelationList.class);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
|
|
||||||
* file,
|
|
||||||
*
|
|
||||||
* @param spark
|
|
||||||
* @param inputPath
|
|
||||||
* @return the JavaRDD<SortableRelation> containing all the relationships
|
|
||||||
*/
|
|
||||||
private static JavaRDD<Relation> readPathRelationRDD(
|
|
||||||
SparkSession spark, final String inputPath) {
|
|
||||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void removeOutputDir(SparkSession spark, String path) {
|
private static void removeOutputDir(SparkSession spark, String path) {
|
||||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
}
|
}
|
||||||
|
|
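PrepareRelationsJob drops the RDD and Aggregator machinery and now caps the number of relations kept per source and per target with window functions. A sketch of the pattern in Scala (the project code above is Java), with the column names taken from the hunk:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number}

// Rank relations within each (source, subRelType) group and keep only the rows whose
// rank stays strictly below the configured limit, as in the source_w window above;
// the same pattern is applied a second time over (target, subRelType).
def pruneBySource(rels: DataFrame, sourceMaxRelations: Int): DataFrame = {
  val sourceW = Window
    .partitionBy("source", "subRelType")
    .orderBy(col("target").desc_nulls_last)

  rels
    .withColumn("source_w_pos", row_number().over(sourceW))
    .where(col("source_w_pos") < sourceMaxRelations)
    .drop("source_w_pos")
}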
|
@ -14,4 +14,7 @@ public class ProvisionConstants {
|
||||||
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
|
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static final String PUBLIC_ALIAS_NAME = "public";
|
||||||
|
public static final String SHADOW_ALIAS_NAME = "shadow";
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,44 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
import com.google.common.collect.ComparisonChain;
|
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
|
|
||||||
public class RelationComparator implements Comparator<Relation> {
|
|
||||||
|
|
||||||
private static final Map<String, Integer> weights = Maps.newHashMap();
|
|
||||||
|
|
||||||
static {
|
|
||||||
weights.put(ModelConstants.OUTCOME, 0);
|
|
||||||
weights.put(ModelConstants.SUPPLEMENT, 1);
|
|
||||||
weights.put(ModelConstants.REVIEW, 2);
|
|
||||||
weights.put(ModelConstants.CITATION, 3);
|
|
||||||
weights.put(ModelConstants.AFFILIATION, 4);
|
|
||||||
weights.put(ModelConstants.RELATIONSHIP, 5);
|
|
||||||
weights.put(ModelConstants.PUBLICATION_DATASET, 6);
|
|
||||||
weights.put(ModelConstants.SIMILARITY, 7);
|
|
||||||
|
|
||||||
weights.put(ModelConstants.PROVISION, 8);
|
|
||||||
weights.put(ModelConstants.PARTICIPATION, 9);
|
|
||||||
weights.put(ModelConstants.DEDUP, 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Integer getWeight(Relation o) {
|
|
||||||
return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(Relation o1, Relation o2) {
|
|
||||||
return ComparisonChain
|
|
||||||
.start()
|
|
||||||
.compare(getWeight(o1), getWeight(o2))
|
|
||||||
.result();
|
|
||||||
}
|
|
||||||
}
|
|
Some files were not shown because too many files have changed in this diff.