1
0
Fork 0

Merge branch 'master' into dhp_oaf_model

This commit is contained in:
Claudio Atzori 2020-06-16 15:43:59 +02:00
commit 603b1bd0bb
235 changed files with 16031 additions and 1362 deletions

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>

View File

@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
<packaging>jar</packaging>

View File

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
</parent>
<artifactId>dhp-build</artifactId>
<packaging>pom</packaging>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.common;
import java.io.Serializable;
import java.util.function.Consumer;
import java.util.function.Supplier;
/** Provides serializable and throwing extensions to standard functional interfaces. */
@ -10,6 +11,16 @@ public class FunctionalInterfaceSupport {
private FunctionalInterfaceSupport() {
}
/**
* Serializable consumer of any kind of objects. To be used withing spark processing pipelines when supplying
* functions externally.
*
* @param <T>
*/
@FunctionalInterface
public interface SerializableConsumer<T> extends Consumer<T>, Serializable {
}
/**
* Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying
* functions externally.

View File

@ -16,6 +16,12 @@ import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
/**
* PacePerson tries to derive information from the fullname string of an author. Such informations are Names, Surnames
* an Fullname split into terms. It provides also an additional field for the original data. The calculation of the
* names and the surnames is not always possible. When it is impossible to assert which are the names and the surnames,
* the lists are empty.
*/
public class PacePerson {
private static final String UTF8 = "UTF-8";
@ -26,10 +32,19 @@ public class PacePerson {
private static Set<String> particles = null;
/**
* Capitalizes a string
*
* @param s the string to capitalize
* @return the input string with capital letter
*/
public static final String capitalize(final String s) {
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
}
/**
* Adds a dot to a string with length equals to 1
*/
public static final String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s;
}
@ -46,6 +61,12 @@ public class PacePerson {
return h;
}
/**
* The constructor of the class. It fills the fields of the class basing on the input fullname.
*
* @param s the input string (fullname of the author)
* @param aggressive set the string normalization type
*/
public PacePerson(String s, final boolean aggressive) {
original = s;
s = Normalizer.normalize(s, Normalizer.Form.NFD);
@ -64,6 +85,7 @@ public class PacePerson {
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
// if the string contains a comma, it can derive surname and name by splitting on it
if (s.contains(",")) {
final String[] arr = s.split(",");
if (arr.length == 1) {
@ -74,21 +96,23 @@ public class PacePerson {
fullname.addAll(surname);
fullname.addAll(name);
}
} else {
} else { // otherwise, it should rely on CAPS terms and short terms
fullname = splitTerms(s);
int lastInitialPosition = fullname.size();
boolean hasSurnameInUpperCase = false;
// computes lastInitialPosition and hasSurnameInUpperCase
for (int i = 0; i < fullname.size(); i++) {
final String term = fullname.get(i);
if (term.length() == 1) {
lastInitialPosition = i;
lastInitialPosition = i; // first word in the name longer than 1 (to avoid name with dots)
} else if (term.equals(term.toUpperCase())) {
hasSurnameInUpperCase = true;
hasSurnameInUpperCase = true; // if one of the words is CAPS
}
}
// manages particular cases of fullnames
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
name = fullname.subList(0, lastInitialPosition + 1);
surname = fullname.subList(lastInitialPosition + 1, fullname.size());

View File

@ -0,0 +1,27 @@
package eu.dnetlib.dhp.common;
import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;
public class PacePersonTest {
@Test
public void pacePersonTest1() {
PacePerson p = new PacePerson("Artini, Michele", false);
assertEquals("Artini", p.getSurnameString());
assertEquals("Michele", p.getNameString());
assertEquals("Artini, Michele", p.getNormalisedFullname());
}
@Test
public void pacePersonTest2() {
PacePerson p = new PacePerson("Michele G. Artini", false);
assertEquals("Artini, Michele G.", p.getNormalisedFullname());
assertEquals("Michele G", p.getNameString());
assertEquals("Artini", p.getSurnameString());
}
}

View File

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>

View File

@ -14,6 +14,7 @@ public class ModelConstants {
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
public static final String DNET_COUNTRY_TYPE = "dnet:countries";
public static final String DNET_REVIEW_LEVELS = "dnet:review_levels";
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";
@ -25,6 +26,10 @@ public class ModelConstants {
public static final String ORP_RESULTTYPE_CLASSID = "other";
public static final String RESULT_RESULT = "resultResult";
/**
* @deprecated Use {@link ModelConstants#RELATIONSHIP} instead.
*/
@Deprecated
public static final String PUBLICATION_DATASET = "publicationDataset";
public static final String IS_RELATED_TO = "isRelatedTo";
public static final String SUPPLEMENT = "supplement";
@ -34,6 +39,12 @@ public class ModelConstants {
public static final String IS_PART_OF = "IsPartOf";
public static final String HAS_PARTS = "HasParts";
public static final String RELATIONSHIP = "relationship";
public static final String CITATION = "citation";
public static final String CITES = "cites";
public static final String IS_CITED_BY = "IsCitedBy";
public static final String REVIEW = "review";
public static final String REVIEWS = "reviews";
public static final String IS_REVIEWED_BY = "IsReviewedBy";
public static final String RESULT_PROJECT = "resultProject";
public static final String OUTCOME = "outcome";

View File

@ -58,6 +58,18 @@ public class ModelSupport {
oafTypes.put("relation", Relation.class);
}
public static final Map<Class, String> idPrefixMap = Maps.newHashMap();
static {
idPrefixMap.put(Datasource.class, "10");
idPrefixMap.put(Organization.class, "20");
idPrefixMap.put(Project.class, "40");
idPrefixMap.put(Dataset.class, "50");
idPrefixMap.put(OtherResearchProduct.class, "50");
idPrefixMap.put(Software.class, "50");
idPrefixMap.put(Publication.class, "50");
}
public static final Map<String, String> entityIdPrefix = Maps.newHashMap();
static {
@ -289,6 +301,10 @@ public class ModelSupport {
private ModelSupport() {
}
public static <E extends OafEntity> String getIdPrefix(Class<E> clazz) {
return idPrefixMap.get(clazz);
}
/**
* Checks subclass-superclass relationship.
*

View File

@ -10,6 +10,7 @@ public class Dataset extends Result implements Serializable {
private Field<String> storagedate;
// candidate for removal
private Field<String> device;
private Field<String> size;

View File

@ -31,7 +31,7 @@ public class Instance implements Serializable {
// typed results
private Field<String> processingchargecurrency;
private Field<String> refereed; // peer-review status
private Qualifier refereed; // peer-review status
public Field<String> getLicense() {
return license;
@ -113,11 +113,11 @@ public class Instance implements Serializable {
this.processingchargecurrency = processingchargecurrency;
}
public Field<String> getRefereed() {
public Qualifier getRefereed() {
return refereed;
}
public void setRefereed(Field<String> refereed) {
public void setRefereed(Qualifier refereed) {
this.refereed = refereed;
}

View File

@ -2,8 +2,10 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
public class Result extends OafEntity implements Serializable {
@ -260,21 +262,29 @@ public class Result extends OafEntity implements Serializable {
StructuredProperty baseMainTitle = null;
if (title != null) {
baseMainTitle = getMainTitle(title);
title.remove(baseMainTitle);
if (baseMainTitle != null) {
final StructuredProperty p = baseMainTitle;
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
}
}
StructuredProperty newMainTitle = null;
if (r.getTitle() != null) {
newMainTitle = getMainTitle(r.getTitle());
r.getTitle().remove(newMainTitle);
if (newMainTitle != null && title != null) {
final StructuredProperty p = newMainTitle;
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
}
}
if (newMainTitle != null && compareTrust(this, r) < 0)
if (newMainTitle != null && compareTrust(this, r) < 0) {
baseMainTitle = newMainTitle;
}
title = mergeLists(title, r.getTitle());
if (title != null && baseMainTitle != null)
if (title != null && baseMainTitle != null) {
title.add(baseMainTitle);
}
relevantdate = mergeLists(relevantdate, r.getRelevantdate());

View File

@ -10,8 +10,10 @@ public class Software extends Result implements Serializable {
private List<Field<String>> documentationUrl;
// candidate for removal
private List<StructuredProperty> license;
// candidate for removal
private Field<String> codeRepositoryUrl;
private Qualifier programmingLanguage;

View File

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
</parent>
<artifactId>dhp-actionmanager</artifactId>

View File

@ -96,12 +96,21 @@ public class ProtoConverter implements Serializable {
.stream()
.distinct()
.collect(Collectors.toCollection(ArrayList::new)) : null);
i.setRefereed(mapStringField(ri.getRefereed()));
i.setRefereed(mapRefereed(ri.getRefereed()));
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
return i;
}
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
Qualifier q = new Qualifier();
q.setClassid(refereed.getValue());
q.setSchemename(refereed.getValue());
q.setSchemeid("dnet:review_levels");
q.setSchemename("dnet:review_levels");
return q;
}
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getExternalReferenceCount() > 0) {

View File

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>

View File

@ -8,6 +8,7 @@ import java.io.File;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
@ -19,6 +20,7 @@ import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager;
@Disabled
public class DnetCollectorWorkerApplicationTests {
private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class);

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -1,66 +1,70 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-broker-events</artifactId>
<artifactId>dhp-broker-events</artifactId>
<dependencies>
<dependencies>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-broker-common</artifactId>
<version>[2.0.0,3.0.0)</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-broker-common</artifactId>
<version>[3.0.1,4.0.0)</version>
</dependency>
</dependencies>
</dependencies>
</project>

View File

@ -0,0 +1,37 @@
<?xml version='1.0' encoding='UTF-8'?>
<dfxml xmloutputversion='1.0'>
<metadata
xmlns='http://afflib.org/tcpflow/'
xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'
xmlns:dc='http://purl.org/dc/elements/1.1/'>
<dc:type>Feature Extraction</dc:type>
</metadata>
<creator version='1.0'>
<program>TCPFLOW</program>
<version>1.5.0</version>
<build_environment>
<compiler>4.2.1 (4.2.1 Compatible Apple LLVM 11.0.0 (clang-1100.0.33.8))</compiler>
<CPPFLAGS>-D_THREAD_SAFE -pthread -I/usr/local/include -I/usr/local/include -DUTC_OFFSET=+0000 </CPPFLAGS>
<CFLAGS>-g -D_THREAD_SAFE -pthread -g -O3 -MD -Wpointer-arith -Wmissing-declarations -Wmissing-prototypes -Wshadow -Wwrite-strings -Wcast-align -Waggregate-return -Wbad-function-cast -Wcast-qual -Wundef -Wredundant-decls -Wdisabled-optimization -Wfloat-equal -Wmultichar -Wc++-compat -Wmissing-noreturn -Wall -Wstrict-prototypes -MD -D_FORTIFY_SOURCE=2 -Wpointer-arith -Wmissing-declarations -Wmissing-prototypes -Wshadow -Wwrite-strings -Wcast-align -Waggregate-return -Wbad-function-cast -Wcast-qual -Wundef -Wredundant-decls -Wdisabled-optimization -Wfloat-equal -Wmultichar -Wc++-compat -Wmissing-noreturn -Wall -Wstrict-prototypes</CFLAGS>
<CXXFLAGS>-g -D_THREAD_SAFE -pthread -g -O3 -Wall -MD -D_FORTIFY_SOURCE=2 -Wpointer-arith -Wshadow -Wwrite-strings -Wcast-align -Wredundant-decls -Wdisabled-optimization -Wfloat-equal -Wmultichar -Wmissing-noreturn -Woverloaded-virtual -Wsign-promo -funit-at-a-time -Weffc++ -std=c++11 -Wall -MD -D_FORTIFY_SOURCE=2 -Wpointer-arith -Wshadow -Wwrite-strings -Wcast-align -Wredundant-decls -Wdisabled-optimization -Wfloat-equal -Wmultichar -Wmissing-noreturn -Woverloaded-virtual -Wsign-promo -funit-at-a-time -Weffc++ </CXXFLAGS>
<LDFLAGS>-L/usr/local/lib -L/usr/local/lib </LDFLAGS>
<LIBS>-lpython2.7 -lpython2.7 -lpcap -lbz2 -lexpat -lsqlite3 -lcrypto -lssl -lcrypto -ldl -lz </LIBS>
<compilation_date>2019-10-11T01:16:58</compilation_date>
<library name="boost" version="107100"/>
<library name="sqlite" version="3.28.0" source_id="2019-04-15 14:49:49 378230ae7f4b721c8b8d83c8ceb891449685cd23b1702a57841f1be40b5daapl"/>
</build_environment>
<execution_environment>
<os_sysname>Darwin</os_sysname>
<os_release>19.5.0</os_release>
<os_version>Darwin Kernel Version 19.5.0: Tue May 26 20:41:44 PDT 2020; root:xnu-6153.121.2~2/RELEASE_X86_64</os_version>
<host>Micheles-MBP.local</host>
<arch>x86_64</arch>
<command_line>tcpflow</command_line>
<uid>501</uid>
<username>michele</username>
<start_time>2020-06-15T14:55:03Z</start_time>
</execution_environment>
</creator>
<configuration>
</configuration>
<tdelta>0</tdelta>

View File

@ -1,9 +1,15 @@
package eu.dnetlib.dhp.broker.model;
import java.io.Serializable;
import java.util.Map;
public class Event {
public class Event implements Serializable {
/**
*
*/
private static final long serialVersionUID = -5936790326505781395L;
private String eventId;

View File

@ -6,18 +6,13 @@ import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import eu.dnetlib.broker.objects.OpenAireEventPayload;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class EventFactory {
@ -37,15 +32,12 @@ public class EventFactory {
final Map<String, Object> map = createMapFromResult(updateInfo);
final String payload = createPayload(updateInfo);
final String eventId = calculateEventId(
updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0),
updateInfo.getHighlightValueAsString());
updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId(), updateInfo.getHighlightValueAsString());
res.setEventId(eventId);
res.setProducerId(PRODUCER_ID);
res.setPayload(payload);
res.setPayload(updateInfo.asBrokerPayload().toJSON());
res.setMap(map);
res.setTopic(updateInfo.getTopicPath());
res.setCreationDate(now);
@ -54,65 +46,34 @@ public class EventFactory {
return res;
}
private static String createPayload(final UpdateInfo<?> updateInfo) {
final OpenAireEventPayload payload = new OpenAireEventPayload();
// TODO
updateInfo.compileHighlight(payload);
return payload.toJSON();
}
private static Map<String, Object> createMapFromResult(final UpdateInfo<?> updateInfo) {
final Map<String, Object> map = new HashMap<>();
final Result source = updateInfo.getSource();
final Result target = updateInfo.getTarget();
final OpenaireBrokerResult source = updateInfo.getSource();
final OpenaireBrokerResult target = updateInfo.getTarget();
final List<KeyValue> collectedFrom = target.getCollectedfrom();
if (collectedFrom.size() == 1) {
map.put("target_datasource_id", collectedFrom.get(0).getKey());
map.put("target_datasource_name", collectedFrom.get(0).getValue());
}
map.put("target_datasource_id", target.getCollectedFromId());
map.put("target_datasource_name", target.getCollectedFromName());
final List<String> ids = target.getOriginalId();
if (ids.size() > 0) {
map.put("target_publication_id", ids.get(0));
}
map.put("target_publication_id", target.getOriginalId());
final List<StructuredProperty> titles = target.getTitle();
final List<String> titles = target.getTitles();
if (titles.size() > 0) {
map.put("target_publication_title", titles.get(0));
}
final long date = parseDateTolong(target.getDateofacceptance().getValue());
final long date = parseDateTolong(target.getPublicationdate());
if (date > 0) {
map.put("target_dateofacceptance", date);
}
final List<StructuredProperty> subjects = target.getSubject();
if (subjects.size() > 0) {
map
.put(
"target_publication_subject_list",
subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
}
final List<Author> authors = target.getAuthor();
if (authors.size() > 0) {
map
.put(
"target_publication_author_list",
authors.stream().map(Author::getFullname).collect(Collectors.toList()));
}
map.put("target_publication_subject_list", target.getSubjects());
map.put("target_publication_author_list", target.getCreators());
// PROVENANCE INFO
map.put("trust", updateInfo.getTrust());
final List<KeyValue> sourceCollectedFrom = source.getCollectedfrom();
if (sourceCollectedFrom.size() == 1) {
map.put("provenance_datasource_id", sourceCollectedFrom.get(0).getKey());
map.put("provenance_datasource_name", sourceCollectedFrom.get(0).getValue());
}
map.put("provenance_datasource_id", source.getCollectedFromId());
map.put("provenance_datasource_name", source.getCollectedFromName());
map.put("provenance_publication_id_list", source.getOriginalId());
return map;

View File

@ -3,59 +3,33 @@ package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.TypedColumn;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.model.EventFactory;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.EventFinder;
import eu.dnetlib.dhp.broker.oa.util.EventGroup;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAggregator;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
@ -63,49 +37,23 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
import scala.Tuple2;
public class GenerateEventsApplication {
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
// Simple Matchers
private static final UpdateMatcher<Result, ?> enrichMissingAbstract = new EnrichMissingAbstract();
private static final UpdateMatcher<Result, ?> enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid();
private static final UpdateMatcher<Result, ?> enrichMissingOpenAccess = new EnrichMissingOpenAccess();
private static final UpdateMatcher<Result, ?> enrichMissingPid = new EnrichMissingPid();
private static final UpdateMatcher<Result, ?> enrichMissingPublicationDate = new EnrichMissingPublicationDate();
private static final UpdateMatcher<Result, ?> enrichMissingSubject = new EnrichMissingSubject();
private static final UpdateMatcher<Result, ?> enrichMoreOpenAccess = new EnrichMoreOpenAccess();
private static final UpdateMatcher<Result, ?> enrichMorePid = new EnrichMorePid();
private static final UpdateMatcher<Result, ?> enrichMoreSubject = new EnrichMoreSubject();
// Advanced matchers
private static final UpdateMatcher<Pair<Result, List<Project>>, ?> enrichMissingProject = new EnrichMissingProject();
private static final UpdateMatcher<Pair<Result, List<Project>>, ?> enrichMoreProject = new EnrichMoreProject();
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMissingSoftware = new EnrichMissingSoftware();
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMoreSoftware = new EnrichMoreSoftware();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy = new EnrichMissingPublicationIsReferencedBy();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo = new EnrichMissingPublicationIsSupplementedTo();
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy = new EnrichMissingPublicationIsSupplementedBy();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo = new EnrichMissingDatasetIsRelatedTo();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy = new EnrichMissingDatasetIsReferencedBy();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences = new EnrichMissingDatasetReferences();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo = new EnrichMissingDatasetIsSupplementedTo();
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy();
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenerateEventsApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_broker_events.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
@ -120,20 +68,32 @@ public class GenerateEventsApplication {
final String eventsPath = parser.get("eventsPath");
log.info("eventsPath: {}", eventsPath);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String dedupConfigProfileId = parser.get("dedupConfProfile");
log.info("dedupConfigProfileId: {}", dedupConfigProfileId);
final SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(BrokerConstants.getModelClasses());
final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
removeOutputDir(spark, eventsPath);
final JavaRDD<Event> eventsRdd = sc.emptyRDD();
eventsRdd.union(generateSimpleEvents(spark, graphPath, Publication.class));
eventsRdd.union(generateSimpleEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class));
eventsRdd.union(generateSimpleEvents(spark, graphPath, Software.class));
eventsRdd.union(generateSimpleEvents(spark, graphPath, OtherResearchProduct.class));
eventsRdd.saveAsTextFile(eventsPath, GzipCodec.class);
spark
.emptyDataset(Encoders.kryo(Event.class))
.union(generateEvents(spark, graphPath, Publication.class, dedupConfig))
.union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig))
.union(generateEvents(spark, graphPath, Software.class, dedupConfig))
.union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(eventsPath);
});
}
@ -142,130 +102,85 @@ public class GenerateEventsApplication {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
private static <R extends Result> JavaRDD<Event> generateSimpleEvents(final SparkSession spark,
private static <SRC extends Result> Dataset<Event> generateEvents(
final SparkSession spark,
final String graphPath,
final Class<R> resultClazz) {
final Class<SRC> sourceClass,
final DedupConfig dedupConfig) {
final Dataset<R> results = readPath(
spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz)
.filter(r -> r.getDataInfo().getDeletedbyinference());
final Dataset<OpenaireBrokerResult> results = expandResultsWithRelations(spark, graphPath, sourceClass);
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
final Dataset<Relation> mergedRels = readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
final Column c = null; // TODO
final TypedColumn<Tuple2<OpenaireBrokerResult, Relation>, ResultGroup> aggr = new ResultAggregator()
.toColumn();
final Dataset<Row> aa = results
.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner")
.groupBy(rels.col("target"))
.agg(c)
.filter(x -> x.size() > 1)
// generateSimpleEvents(...)
// flatMap()
// toRdd()
return results
.joinWith(mergedRels, results.col("result.id").equalTo(mergedRels.col("source")), "inner")
.groupByKey(
(MapFunction<Tuple2<OpenaireBrokerResult, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
.agg(aggr)
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
.filter(ResultGroup::isValid)
.map(
(MapFunction<ResultGroup, EventGroup>) g -> EventFinder.generateEvents(g, dedupConfig),
Encoders.kryo(EventGroup.class))
.flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class));
}
private static <SRC extends Result> Dataset<OpenaireBrokerResult> expandResultsWithRelations(
final SparkSession spark,
final String graphPath,
final Class<SRC> sourceClass) {
final Dataset<Project> projects = readPath(spark, graphPath + "/project", Project.class);
final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = readPath(
spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
final Dataset<Software> softwares = readPath(spark, graphPath + "/software", Software.class);
final Dataset<Publication> publications = readPath(spark, graphPath + "/publication", Publication.class);
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
final Dataset<OpenaireBrokerResult> r0 = readPath(
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.map(ConversionUtils::oafResultToBrokerResult, Encoders.kryo(OpenaireBrokerResult.class));
final Dataset<OpenaireBrokerResult> r1 = join(r0, rels, relatedEntities(projects, rels, RelatedProject.class));
final Dataset<OpenaireBrokerResult> r2 = join(r1, rels, relatedEntities(softwares, rels, RelatedProject.class));
final Dataset<OpenaireBrokerResult> r3 = join(r2, rels, relatedEntities(datasets, rels, RelatedProject.class));
final Dataset<OpenaireBrokerResult> r4 = join(
r3, rels, relatedEntities(publications, rels, RelatedProject.class));
;
return null;
return r4;
}
private List<Event> generateSimpleEvents(final Collection<Result> children) {
final List<UpdateInfo<?>> list = new ArrayList<>();
for (final Result target : children) {
list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, children));
list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children));
list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children));
list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children));
list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children));
list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children));
list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children));
list.addAll(enrichMorePid.searchUpdatesForRecord(target, children));
list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, children));
}
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
private static <T, RT> Dataset<RT> relatedEntities(final Dataset<T> targets,
final Dataset<Relation> rels,
final Class<RT> clazz) {
return rels
.joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner")
.map(
t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz),
Encoders.kryo(clazz));
}
private List<Event> generateProjectsEvents(final Collection<Pair<Result, List<Project>>> childrenWithProjects) {
final List<UpdateInfo<?>> list = new ArrayList<>();
private static <T> Dataset<OpenaireBrokerResult> join(final Dataset<OpenaireBrokerResult> sources,
final Dataset<Relation> rels,
final Dataset<T> typedRels) {
for (final Pair<Result, List<Project>> target : childrenWithProjects) {
list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects));
list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects));
}
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
}
private List<Event> generateSoftwareEvents(final Collection<Pair<Result, List<Software>>> childrenWithSoftwares) {
final List<UpdateInfo<?>> list = new ArrayList<>();
for (final Pair<Result, List<Software>> target : childrenWithSoftwares) {
list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares));
list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares));
}
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
}
private List<Event> generatePublicationRelatedEvents(final String relType,
final Collection<Pair<Result, Map<String, List<Publication>>>> childrenWithRels) {
final List<UpdateInfo<?>> list = new ArrayList<>();
final List<Pair<Result, List<Publication>>> cleanedChildrens = childrenWithRels
.stream()
.filter(p -> p.getRight().containsKey(relType))
.map(p -> Pair.of(p.getLeft(), p.getRight().get(relType)))
.filter(p -> p.getRight().size() > 0)
.collect(Collectors.toList());
for (final Pair<Result, List<Publication>> target : cleanedChildrens) {
if (relType.equals("isRelatedTo")) {
list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens));
} else if (relType.equals("references")) {
list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens));
} else if (relType.equals("isReferencedBy")) {
list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens));
} else if (relType.equals("isSupplementedTo")) {
list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens));
} else if (relType.equals("isSupplementedBy")) {
list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens));
}
}
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
}
private List<Event> generateDatasetRelatedEvents(final String relType,
final Collection<Pair<Result, Map<String, List<eu.dnetlib.dhp.schema.oaf.Dataset>>>> childrenWithRels) {
final List<UpdateInfo<?>> list = new ArrayList<>();
final List<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>> cleanedChildrens = childrenWithRels
.stream()
.filter(p -> p.getRight().containsKey(relType))
.map(p -> Pair.of(p.getLeft(), p.getRight().get(relType)))
.filter(p -> p.getRight().size() > 0)
.collect(Collectors.toList());
for (final Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>> target : cleanedChildrens) {
if (relType.equals("isRelatedTo")) {
list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens));
} else if (relType.equals("references")) {
list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens));
} else if (relType.equals("isReferencedBy")) {
list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens));
} else if (relType.equals("isSupplementedTo")) {
list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens));
} else if (relType.equals("isSupplementedBy")) {
list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens));
}
}
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
final TypedColumn<Tuple2<OpenaireBrokerResult, T>, OpenaireBrokerResult> aggr = new OpenaireBrokerResultAggregator<T>()
.toColumn();
;
return sources
.joinWith(typedRels, sources.col("result.id").equalTo(rels.col("source")), "left_outer")
.groupByKey(
(MapFunction<Tuple2<OpenaireBrokerResult, T>, String>) t -> t._1.getOpenaireId(), Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.kryo(OpenaireBrokerResult.class));
}
public static <R> Dataset<R> readPath(
@ -277,4 +192,23 @@ public class GenerateEventsApplication {
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final String conf = isLookUpService
.getResourceProfileByQuery(
String
.format(
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
profId));
final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
dedupConfig.getPace().initModel();
dedupConfig.getPace().initTranslationMap();
// dedupConfig.getWf().setConfigurationId("???");
return dedupConfig;
}
}

View File

@ -6,28 +6,45 @@ import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.Function;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.pace.config.DedupConfig;
public abstract class UpdateMatcher<K, T> {
public abstract class UpdateMatcher<T> {
private final boolean multipleUpdate;
private final Function<T, Topic> topicFunction;
private final BiConsumer<OpenaireBrokerResult, T> compileHighlightFunction;
private final Function<T, String> highlightToStringFunction;
public UpdateMatcher(final boolean multipleUpdate) {
public UpdateMatcher(final boolean multipleUpdate, final Function<T, Topic> topicFunction,
final BiConsumer<OpenaireBrokerResult, T> compileHighlightFunction,
final Function<T, String> highlightToStringFunction) {
this.multipleUpdate = multipleUpdate;
this.topicFunction = topicFunction;
this.compileHighlightFunction = compileHighlightFunction;
this.highlightToStringFunction = highlightToStringFunction;
}
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final K res, final Collection<K> others) {
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OpenaireBrokerResult res,
final Collection<OpenaireBrokerResult> others,
final DedupConfig dedupConfig) {
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
for (final K source : others) {
for (final OpenaireBrokerResult source : others) {
if (source != res) {
for (final UpdateInfo<T> info : findUpdates(source, res)) {
for (final T hl : findDifferences(source, res)) {
final Topic topic = getTopicFunction().apply(hl);
final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, res, getCompileHighlightFunction(),
getHighlightToStringFunction(), dedupConfig);
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
} else {
@ -51,18 +68,30 @@ public abstract class UpdateMatcher<K, T> {
}
}
protected abstract List<UpdateInfo<T>> findUpdates(K source, K target);
protected abstract List<T> findDifferences(OpenaireBrokerResult source, OpenaireBrokerResult target);
protected abstract UpdateInfo<T> generateUpdateInfo(final T highlightValue,
final K source,
final K target);
protected static boolean isMissing(final List<Field<String>> list) {
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());
protected static boolean isMissing(final List<String> list) {
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0));
}
protected boolean isMissing(final Field<String> field) {
return field == null || StringUtils.isBlank(field.getValue());
protected boolean isMissing(final String field) {
return StringUtils.isBlank(field);
}
public boolean isMultipleUpdate() {
return multipleUpdate;
}
public Function<T, Topic> getTopicFunction() {
return topicFunction;
}
public BiConsumer<OpenaireBrokerResult, T> getCompileHighlightFunction() {
return compileHighlightFunction;
}
public Function<T, String> getHighlightToStringFunction() {
return highlightToStringFunction;
}
}

View File

@ -5,59 +5,41 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.broker.objects.Dataset;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Result;
public abstract class AbstractEnrichMissingDataset
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
private final Topic topic;
extends UpdateMatcher<Dataset> {
public AbstractEnrichMissingDataset(final Topic topic) {
super(true);
this.topic = topic;
super(true,
rel -> topic,
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getOriginalId());
}
protected abstract boolean filterByType(String relType);
@Override
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(
final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
protected final List<Dataset> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final Set<String> existingDatasets = target
.getRight()
.getDatasets()
.stream()
.map(Dataset::getId)
.filter(rel -> filterByType(rel.getRelType()))
.map(Dataset::getOriginalId)
.collect(Collectors.toSet());
return source
.getRight()
.getDatasets()
.stream()
.filter(d -> !existingDatasets.contains(d.getId()))
.map(ConversionUtils::oafDatasetToBrokerDataset)
.map(i -> generateUpdateInfo(i, source, target))
.filter(rel -> filterByType(rel.getRelType()))
.filter(d -> !existingDatasets.contains(d.getOriginalId()))
.collect(Collectors.toList());
}
@Override
protected final UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
final eu.dnetlib.broker.objects.Dataset highlightValue,
final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
return new UpdateInfo<>(
getTopic(),
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getInstances().get(0).getUrl());
}
public Topic getTopic() {
return topic;
}
}

View File

@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDat
super(Topic.ENRICH_MISSING_DATASET_IS_REFERENCED_BY);
}
@Override
protected boolean filterByType(final String relType) {
return relType.equals("isReferencedBy");
}
}

View File

@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDatase
super(Topic.ENRICH_MISSING_DATASET_IS_RELATED_TO);
}
@Override
protected boolean filterByType(final String relType) {
return relType.equals("isRelatedTo");
}
}

View File

@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingD
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY);
}
@Override
protected boolean filterByType(final String relType) {
return relType.equals("isSupplementedBy");
}
}

View File

@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingD
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO);
}
@Override
protected boolean filterByType(final String relType) {
return relType.equals("isSupplementedTo");
}
}

View File

@ -9,4 +9,9 @@ public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset
super(Topic.ENRICH_MISSING_DATASET_REFERENCES);
}
@Override
protected boolean filterByType(final String relType) {
return relType.equals("references");
}
}

View File

@ -1,41 +1,29 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.Project;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingProject
extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
public class EnrichMissingProject extends UpdateMatcher<Project> {
public EnrichMissingProject() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
final Pair<Result, List<Project>> target) {
// TODO
return Arrays.asList();
}
@Override
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
final eu.dnetlib.broker.objects.Project highlightValue,
final Pair<Result, List<Project>> source,
final Pair<Result, List<Project>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PROJECT,
highlightValue, source.getLeft(), target.getLeft(),
super(true,
prj -> Topic.ENRICH_MISSING_PROJECT,
(p, prj) -> p.getProjects().add(prj),
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
}
@Override
protected List<Project> findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) {
if (target.getProjects().isEmpty()) {
return source.getProjects();
} else {
return new ArrayList<>();
}
}
}

View File

@ -1,40 +1,43 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.Project;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMoreProject extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
public class EnrichMoreProject extends UpdateMatcher<Project> {
public EnrichMoreProject() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
final Pair<Result, List<Project>> target) {
// TODO
return Arrays.asList();
}
@Override
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
final eu.dnetlib.broker.objects.Project highlightValue,
final Pair<Result, List<Project>> source,
final Pair<Result, List<Project>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MORE_PROJECT,
highlightValue, source.getLeft(), target.getLeft(),
super(true,
prj -> Topic.ENRICH_MORE_PROJECT,
(p, prj) -> p.getProjects().add(prj),
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
prj -> projectAsString(prj));
}
private static String projectAsString(final Project prj) {
return prj.getFunder() + "::" + prj.getFundingProgram() + "::" + prj.getCode();
}
@Override
protected List<eu.dnetlib.broker.objects.Project> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final Set<String> existingProjects = target
.getProjects()
.stream()
.map(EnrichMoreProject::projectAsString)
.collect(Collectors.toSet());
return source
.getProjects()
.stream()
.filter(p -> !existingProjects.contains(projectAsString(p)))
.collect(Collectors.toList());
}
}

View File

@ -5,59 +5,41 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.Publication;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
public abstract class AbstractEnrichMissingPublication
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
private final Topic topic;
public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<Publication> {
public AbstractEnrichMissingPublication(final Topic topic) {
super(true);
this.topic = topic;
super(true,
rel -> topic,
(p, rel) -> p.getPublications().add(rel),
rel -> rel.getOriginalId());
}
protected abstract boolean filterByType(String relType);
@Override
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
protected final List<eu.dnetlib.broker.objects.Publication> findDifferences(
final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final Set<String> existingPublications = target
.getRight()
.getPublications()
.stream()
.map(Publication::getId)
.filter(rel -> filterByType(rel.getRelType()))
.map(Publication::getOriginalId)
.collect(Collectors.toSet());
return source
.getRight()
.getPublications()
.stream()
.filter(d -> !existingPublications.contains(d.getId()))
.map(ConversionUtils::oafPublicationToBrokerPublication)
.map(i -> generateUpdateInfo(i, source, target))
.filter(rel -> filterByType(rel.getRelType()))
.filter(p -> !existingPublications.contains(p.getOriginalId()))
.collect(Collectors.toList());
}
@Override
protected final UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
final eu.dnetlib.broker.objects.Publication highlightValue,
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
return new UpdateInfo<>(
getTopic(),
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> p.getPublications().add(rel),
rel -> rel.getInstances().get(0).getUrl());
}
public Topic getTopic() {
return topic;
}
}

View File

@ -9,4 +9,8 @@ public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissin
super(Topic.ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY);
}
@Override
protected boolean filterByType(final String relType) {
return relType.equals("isReferencedBy");
}
}

View File

@ -9,4 +9,9 @@ public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPu
super(Topic.ENRICH_MISSING_PUBLICATION_IS_RELATED_TO);
}
@Override
protected boolean filterByType(final String relType) {
return relType.equals("isRelatedTo");
}
}

View File

@ -9,4 +9,8 @@ public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMiss
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY);
}
@Override
protected boolean filterByType(final String relType) {
return relType.equals("isSupplementedBy");
}
}

View File

@ -9,4 +9,9 @@ public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMiss
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO);
}
@Override
protected boolean filterByType(final String relType) {
return relType.equals("isSupplementedTo");
}
}

View File

@ -9,4 +9,9 @@ public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPub
super(Topic.ENRICH_MISSING_PUBLICATION_REFERENCES);
}
@Override
protected boolean filterByType(final String relType) {
return relType.equals("references");
}
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingSoftware
extends UpdateMatcher<eu.dnetlib.broker.objects.Software> {
public EnrichMissingSoftware() {
super(true,
s -> Topic.ENRICH_MISSING_SOFTWARE,
(p, s) -> p.getSoftwares().add(s),
s -> s.getName());
}
@Override
protected List<eu.dnetlib.broker.objects.Software> findDifferences(
final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
if (target.getSoftwares().isEmpty()) {
return source.getSoftwares();
} else {
return new ArrayList<>();
}
}
}

View File

@ -0,0 +1,40 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.Software;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMoreSoftware extends UpdateMatcher<Software> {
public EnrichMoreSoftware() {
super(true,
s -> Topic.ENRICH_MORE_SOFTWARE,
(p, s) -> p.getSoftwares().add(s),
s -> s.getName());
}
@Override
protected List<eu.dnetlib.broker.objects.Software> findDifferences(
final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final Set<String> existingSoftwares = source
.getSoftwares()
.stream()
.map(Software::getName)
.collect(Collectors.toSet());
return target
.getSoftwares()
.stream()
.filter(p -> !existingSoftwares.contains(p.getName()))
.collect(Collectors.toList());
}
}

View File

@ -5,34 +5,26 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingAbstract extends UpdateMatcher<Result, String> {
public class EnrichMissingAbstract extends UpdateMatcher<String> {
public EnrichMissingAbstract() {
super(false);
}
@Override
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) {
if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) {
return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target));
}
return new ArrayList<>();
}
@Override
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
final Result source,
final Result target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_ABSTRACT,
highlightValue, source, target,
super(false,
s -> Topic.ENRICH_MISSING_ABSTRACT,
(p, s) -> p.getAbstracts().add(s),
s -> s);
}
@Override
protected List<String> findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) {
if (isMissing(target.getAbstracts()) && !isMissing(source.getAbstracts())) {
return Arrays.asList(source.getAbstracts().get(0));
} else {
return new ArrayList<>();
}
}
}

View File

@ -1,36 +1,43 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.broker.objects.Author;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Result, Pair<String, String>> {
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Author> {
public EnrichMissingAuthorOrcid() {
super(true);
super(true,
aut -> Topic.ENRICH_MISSING_AUTHOR_ORCID,
(p, aut) -> p.getCreators().add(aut),
aut -> aut.getOrcid());
}
@Override
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
return Arrays.asList();
}
protected List<Author> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final Set<String> existingOrcids = target
.getCreators()
.stream()
.map(Author::getOrcid)
.filter(StringUtils::isNotBlank)
.collect(Collectors.toSet());
return source
.getCreators()
.stream()
.filter(a -> StringUtils.isNotBlank(a.getOrcid()))
.filter(a -> !existingOrcids.contains(a.getOrcid()))
.collect(Collectors.toList());
@Override
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
final Result source,
final Result target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_AUTHOR_ORCID,
highlightValue, source, target,
(p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()),
pair -> pair.getLeft() + "::" + pair.getRight());
}
}

View File

@ -6,25 +6,27 @@ import java.util.List;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Instance;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
public class EnrichMissingOpenAccess extends UpdateMatcher<Instance> {
public EnrichMissingOpenAccess() {
super(true);
super(true,
i -> Topic.ENRICH_MISSING_OA_VERSION,
(p, i) -> p.getInstances().add(i),
Instance::getUrl);
}
@Override
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) {
protected List<Instance> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final long count = target
.getInstance()
.getInstances()
.stream()
.map(i -> i.getAccessright().getClassid())
.map(Instance::getLicense)
.filter(right -> right.equals(BrokerConstants.OPEN_ACCESS))
.count();
@ -33,24 +35,10 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
}
return source
.getInstance()
.getInstances()
.stream()
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
.map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(s -> s)
.map(i -> generateUpdateInfo(i, source, target))
.filter(i -> i.getLicense().equals(BrokerConstants.OPEN_ACCESS))
.collect(Collectors.toList());
}
@Override
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
final Result source,
final Result target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_OA_VERSION,
highlightValue, source, target,
(p, i) -> p.getInstances().add(i),
Instance::getUrl);
}
}

View File

@ -5,42 +5,33 @@ import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Pid;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingPid extends UpdateMatcher<Result, Pid> {
public class EnrichMissingPid extends UpdateMatcher<TypedValue> {
public EnrichMissingPid() {
super(true);
super(true,
pid -> Topic.ENRICH_MISSING_PID,
(p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue());
}
@Override
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) {
final long count = target.getPid().size();
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final long count = target.getPids().size();
if (count > 0) {
return Arrays.asList();
}
return source
.getPid()
.getPids()
.stream()
.map(ConversionUtils::oafPidToBrokerPid)
.map(i -> generateUpdateInfo(i, source, target))
.collect(Collectors.toList());
}
@Override
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PID,
highlightValue, source, target,
(p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue());
}
}

View File

@ -5,34 +5,28 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingPublicationDate extends UpdateMatcher<Result, String> {
public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
public EnrichMissingPublicationDate() {
super(false);
}
@Override
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) {
if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) {
return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target));
}
return new ArrayList<>();
}
@Override
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
final Result source,
final Result target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PUBLICATION_DATE,
highlightValue, source, target,
super(false,
date -> Topic.ENRICH_MISSING_PUBLICATION_DATE,
(p, date) -> p.setPublicationdate(date),
s -> s);
}
@Override
protected List<String> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
if (isMissing(target.getPublicationdate()) && !isMissing(source.getPublicationdate())) {
return Arrays.asList(source.getPublicationdate());
} else {
return new ArrayList<>();
}
}
}

View File

@ -1,42 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
public class EnrichMissingSoftware
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
public EnrichMissingSoftware() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
final Pair<Result, List<Software>> source,
final Pair<Result, List<Software>> target) {
// TODO
return Arrays.asList();
}
@Override
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
final eu.dnetlib.broker.objects.Software highlightValue,
final Pair<Result, List<Software>> source,
final Pair<Result, List<Software>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_SOFTWARE,
highlightValue, source.getLeft(), target.getLeft(),
(p, s) -> p.getSoftwares().add(s),
s -> s.getName());
}
}

View File

@ -5,50 +5,38 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, String>> {
public class EnrichMissingSubject extends UpdateMatcher<TypedValue> {
public EnrichMissingSubject() {
super(true);
super(true,
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
(p, s) -> p.getSubjects().add(s),
s -> subjectAsString(s));
}
@Override
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
final Set<String> existingTypes = target
.getSubject()
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final Set<String> existingSubject = target
.getSubjects()
.stream()
.map(StructuredProperty::getQualifier)
.map(Qualifier::getClassid)
.map(s -> subjectAsString(s))
.collect(Collectors.toSet());
return source
.getPid()
.getSubjects()
.stream()
.filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid()))
.map(ConversionUtils::oafSubjectToPair)
.map(i -> generateUpdateInfo(i, source, target))
.filter(s -> !existingSubject.contains(subjectAsString(s)))
.collect(Collectors.toList());
}
@Override
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
final Result source,
final Result target) {
return new UpdateInfo<>(
Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()),
highlightValue, source, target,
(p, pair) -> p.getSubjects().add(pair.getRight()),
pair -> pair.getLeft() + "::" + pair.getRight());
private static String subjectAsString(final TypedValue s) {
return s.getType() + "::" + s.getValue();
}
}

View File

@ -6,49 +6,36 @@ import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Instance;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> {
public class EnrichMoreOpenAccess extends UpdateMatcher<Instance> {
public EnrichMoreOpenAccess() {
super(true);
}
@Override
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) {
final Set<String> urls = target
.getInstance()
.stream()
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
.map(i -> i.getUrl())
.flatMap(List::stream)
.collect(Collectors.toSet());
return source
.getInstance()
.stream()
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
.map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(s -> s)
.filter(i -> !urls.contains(i.getUrl()))
.map(i -> generateUpdateInfo(i, source, target))
.collect(Collectors.toList());
}
@Override
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
final Result source,
final Result target) {
return new UpdateInfo<>(
Topic.ENRICH_MORE_OA_VERSION,
highlightValue, source, target,
super(true,
i -> Topic.ENRICH_MORE_OA_VERSION,
(p, i) -> p.getInstances().add(i),
Instance::getUrl);
}
@Override
protected List<Instance> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final Set<String> urls = target
.getInstances()
.stream()
.filter(i -> i.getLicense().equals(BrokerConstants.OPEN_ACCESS))
.map(i -> i.getUrl())
.collect(Collectors.toSet());
return source
.getInstances()
.stream()
.filter(i -> i.getLicense().equals(BrokerConstants.OPEN_ACCESS))
.filter(i -> !urls.contains(i.getUrl()))
.collect(Collectors.toList());
}
}

View File

@ -5,43 +5,37 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Pid;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMorePid extends UpdateMatcher<Result, Pid> {
public class EnrichMorePid extends UpdateMatcher<TypedValue> {
public EnrichMorePid() {
super(true);
super(true,
pid -> Topic.ENRICH_MORE_PID,
(p, pid) -> p.getPids().add(pid),
pid -> pidAsString(pid));
}
@Override
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) {
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final Set<String> existingPids = target
.getPid()
.getPids()
.stream()
.map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
.map(pid -> pidAsString(pid))
.collect(Collectors.toSet());
return source
.getPid()
.getPids()
.stream()
.filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
.map(ConversionUtils::oafPidToBrokerPid)
.map(i -> generateUpdateInfo(i, source, target))
.filter(pid -> !existingPids.contains(pidAsString(pid)))
.collect(Collectors.toList());
}
@Override
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
return new UpdateInfo<>(
Topic.ENRICH_MORE_PID,
highlightValue, source, target,
(p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue());
private static String pidAsString(final TypedValue pid) {
return pid.getType() + "::" + pid.getValue();
}
}

View File

@ -1,42 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
public class EnrichMoreSoftware
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
public EnrichMoreSoftware() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
final Pair<Result, List<Software>> source,
final Pair<Result, List<Software>> target) {
// TODO
return Arrays.asList();
}
@Override
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
final eu.dnetlib.broker.objects.Software highlightValue,
final Pair<Result, List<Software>> source,
final Pair<Result, List<Software>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MORE_SOFTWARE,
highlightValue, source.getLeft(), target.getLeft(),
(p, s) -> p.getSoftwares().add(s),
s -> s.getName());
}
}

View File

@ -5,47 +5,37 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMoreSubject extends UpdateMatcher<Result, Pair<String, String>> {
public class EnrichMoreSubject extends UpdateMatcher<TypedValue> {
public EnrichMoreSubject() {
super(true);
super(true,
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
(p, s) -> p.getSubjects().add(s),
s -> subjectAsString(s));
}
@Override
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
final Set<String> existingSubjects = target
.getSubject()
.getSubjects()
.stream()
.map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
.map(pid -> subjectAsString(pid))
.collect(Collectors.toSet());
return source
.getPid()
.getPids()
.stream()
.filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
.map(ConversionUtils::oafSubjectToPair)
.map(i -> generateUpdateInfo(i, source, target))
.filter(s -> !existingSubjects.contains(subjectAsString(s)))
.collect(Collectors.toList());
}
@Override
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
final Result source,
final Result target) {
return new UpdateInfo<>(
Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()),
highlightValue, source, target,
(p, pair) -> p.getSubjects().add(pair.getRight()),
pair -> pair.getLeft() + "::" + pair.getRight());
private static String subjectAsString(final TypedValue s) {
return s.getType() + "::" + s.getValue();
}
}

View File

@ -1,9 +1,27 @@
package eu.dnetlib.dhp.broker.oa.util;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
import eu.dnetlib.dhp.schema.common.ModelSupport;
public class BrokerConstants {
public final static String OPEN_ACCESS = "OPEN";
public final static String IS_MERGED_IN_CLASS = "isMergedIn";
public static final String OPEN_ACCESS = "OPEN";
public static final String IS_MERGED_IN_CLASS = "isMergedIn";
public static final float MIN_TRUST = 0.25f;
public static final float MAX_TRUST = 1.00f;
public static Class<?>[] getModelClasses() {
final Set<Class<?>> list = new HashSet<>();
list.addAll(Arrays.asList(ModelSupport.getOafModelClasses()));
list.addAll(Arrays.asList(ResultGroup.class, Event.class));
return list.toArray(new Class[] {});
}
}

View File

@ -1,49 +1,213 @@
package eu.dnetlib.dhp.broker.oa.util;
import java.util.stream.Stream;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.Instance;
import eu.dnetlib.broker.objects.Pid;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.ExternalReference;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ConversionUtils {
public static Stream<Instance> oafInstanceToBrokerInstances(final eu.dnetlib.dhp.schema.oaf.Instance i) {
private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class);
public static List<eu.dnetlib.broker.objects.Instance> oafInstanceToBrokerInstances(final Instance i) {
return i.getUrl().stream().map(url -> {
final Instance r = new Instance();
r.setUrl(url);
r.setInstancetype(i.getInstancetype().getClassid());
r.setLicense(BrokerConstants.OPEN_ACCESS);
r.setHostedby(i.getHostedby().getValue());
return r;
});
return new eu.dnetlib.broker.objects.Instance()
.setUrl(url)
.setInstancetype(i.getInstancetype().getClassid())
.setLicense(BrokerConstants.OPEN_ACCESS)
.setHostedby(i.getHostedby().getValue());
}).collect(Collectors.toList());
}
public static Pid oafPidToBrokerPid(final StructuredProperty sp) {
final Pid pid = new Pid();
pid.setValue(sp.getValue());
pid.setType(sp.getQualifier().getClassid());
return pid;
public static TypedValue oafPidToBrokerPid(final StructuredProperty sp) {
return sp != null ? new TypedValue()
.setValue(sp.getValue())
.setType(sp.getQualifier().getClassid()) : null;
}
public static final Pair<String, String> oafSubjectToPair(final StructuredProperty sp) {
return Pair.of(sp.getQualifier().getClassid(), sp.getValue());
return sp != null ? Pair.of(sp.getQualifier().getClassid(), sp.getValue()) : null;
}
public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) {
final eu.dnetlib.broker.objects.Dataset res = new eu.dnetlib.broker.objects.Dataset();
// TODO
return d != null ? new eu.dnetlib.broker.objects.Dataset()
.setOriginalId(d.getOriginalId().get(0))
.setTitle(structPropValue(d.getTitle()))
.setPids(d.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
.setInstances(
d
.getInstance()
.stream()
.map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream)
.collect(Collectors.toList()))
.setCollectedFrom(d.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
: null;
}
public static final OpenaireBrokerResult oafResultToBrokerResult(final Result result) {
return result != null ? new OpenaireBrokerResult()
.setOpenaireId(result.getId())
.setOriginalId(result.getOriginalId().get(0))
.setTypology(result.getResulttype().getClassid())
.setTitles(structPropList(result.getTitle()))
.setAbstracts(fieldList(result.getDescription()))
.setLanguage(result.getLanguage().getClassid())
.setSubjects(structPropTypedList(result.getSubject()))
.setCreators(
result.getAuthor().stream().map(ConversionUtils::oafAuthorToBrokerAuthor).collect(Collectors.toList()))
.setPublicationdate(result.getDateofacceptance().getValue())
.setPublisher(fieldValue(result.getPublisher()))
.setEmbargoenddate(fieldValue(result.getEmbargoenddate()))
.setContributor(fieldList(result.getContributor()))
.setJournal(
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null)
.setCollectedFromId(result.getCollectedfrom().stream().map(KeyValue::getKey).findFirst().orElse(null))
.setCollectedFromName(result.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
.setPids(result.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
.setInstances(
result
.getInstance()
.stream()
.map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream)
.collect(Collectors.toList()))
.setExternalReferences(
result
.getExternalReference()
.stream()
.map(ConversionUtils::oafExtRefToBrokerExtRef)
.collect(Collectors.toList()))
: null;
}
private static List<TypedValue> structPropTypedList(final List<StructuredProperty> list) {
return list
.stream()
.map(
p -> new TypedValue()
.setValue(p.getValue())
.setType(p.getQualifier().getClassid()))
.collect(Collectors.toList());
}
private static eu.dnetlib.broker.objects.Author oafAuthorToBrokerAuthor(final Author author) {
return author != null ? new eu.dnetlib.broker.objects.Author()
.setFullname(author.getFullname())
.setOrcid(
author
.getPid()
.stream()
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
.map(pid -> pid.getValue())
.findFirst()
.orElse(null))
: null;
}
private static eu.dnetlib.broker.objects.Journal oafJournalToBrokerJournal(final Journal journal) {
return journal != null ? new eu.dnetlib.broker.objects.Journal()
.setName(journal.getName())
.setIssn(journal.getIssnPrinted())
.setEissn(journal.getIssnOnline())
.setLissn(journal.getIssnLinking()) : null;
}
private static eu.dnetlib.broker.objects.ExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) {
return ref != null ? new eu.dnetlib.broker.objects.ExternalReference()
.setRefidentifier(ref.getRefidentifier())
.setSitename(ref.getSitename())
.setType(ref.getQualifier().getClassid())
.setUrl(ref.getUrl())
: null;
}
public static final eu.dnetlib.broker.objects.Project oafProjectToBrokerProject(final Project p) {
if (p == null) {
return null;
}
final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project()
.setTitle(fieldValue(p.getTitle()))
.setAcronym(fieldValue(p.getAcronym()))
.setCode(fieldValue(p.getCode()));
final String ftree = fieldValue(p.getFundingtree());
if (StringUtils.isNotBlank(ftree)) {
try {
final Document fdoc = DocumentHelper.parseText(ftree);
res.setFunder(fdoc.valueOf("/fundingtree/funder/shortname"));
res.setJurisdiction(fdoc.valueOf("/fundingtree/funder/jurisdiction"));
res.setFundingProgram(fdoc.valueOf("//funding_level_0/name"));
} catch (final DocumentException e) {
log.error("Error in record " + p.getId() + ": invalid fundingtree: " + ftree);
}
}
return res;
}
public static final eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication d) {
final eu.dnetlib.broker.objects.Publication res = new eu.dnetlib.broker.objects.Publication();
// TODO
return res;
public static final eu.dnetlib.broker.objects.Software oafSoftwareToBrokerSoftware(final Software sw) {
return sw != null ? new eu.dnetlib.broker.objects.Software()
.setName(structPropValue(sw.getTitle()))
.setDescription(fieldValue(sw.getDescription()))
.setRepository(fieldValue(sw.getCodeRepositoryUrl()))
.setLandingPage(fieldValue(sw.getDocumentationUrl()))
: null;
}
private static String fieldValue(final Field<String> f) {
return f != null ? f.getValue() : null;
}
private static String fieldValue(final List<Field<String>> fl) {
return fl != null ? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null)
: null;
}
private static String structPropValue(final List<StructuredProperty> props) {
return props != null
? props.stream().map(StructuredProperty::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null)
: null;
}
private static List<String> fieldList(final List<Field<String>> fl) {
return fl != null
? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).collect(Collectors.toList())
: new ArrayList<>();
}
private static List<String> structPropList(final List<StructuredProperty> props) {
return props != null
? props
.stream()
.map(StructuredProperty::getValue)
.filter(StringUtils::isNotBlank)
.collect(Collectors.toList())
: new ArrayList<>();
}
}

View File

@ -0,0 +1,86 @@
package eu.dnetlib.dhp.broker.oa.util;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.broker.model.EventFactory;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMissingSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMoreSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
import eu.dnetlib.pace.config.DedupConfig;
public class EventFinder {
private static List<UpdateMatcher<?>> matchers = new ArrayList<>();
static {
matchers.add(new EnrichMissingAbstract());
matchers.add(new EnrichMissingAuthorOrcid());
matchers.add(new EnrichMissingOpenAccess());
matchers.add(new EnrichMissingPid());
matchers.add(new EnrichMissingPublicationDate());
matchers.add(new EnrichMissingSubject());
matchers.add(new EnrichMoreOpenAccess());
matchers.add(new EnrichMorePid());
matchers.add(new EnrichMoreSubject());
// Advanced matchers
matchers.add(new EnrichMissingProject());
matchers.add(new EnrichMoreProject());
matchers.add(new EnrichMissingSoftware());
matchers.add(new EnrichMoreSoftware());
matchers.add(new EnrichMissingPublicationIsRelatedTo());
matchers.add(new EnrichMissingPublicationIsReferencedBy());
matchers.add(new EnrichMissingPublicationReferences());
matchers.add(new EnrichMissingPublicationIsSupplementedTo());
matchers.add(new EnrichMissingPublicationIsSupplementedBy());
matchers.add(new EnrichMissingDatasetIsRelatedTo());
matchers.add(new EnrichMissingDatasetIsReferencedBy());
matchers.add(new EnrichMissingDatasetReferences());
matchers.add(new EnrichMissingDatasetIsSupplementedTo());
matchers.add(new EnrichMissingDatasetIsSupplementedBy());
matchers.add(new EnrichMissingAbstract());
}
public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) {
final List<UpdateInfo<?>> list = new ArrayList<>();
for (final OpenaireBrokerResult target : results.getData()) {
for (final UpdateMatcher<?> matcher : matchers) {
list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig));
}
}
return asEventGroup(list);
}
private static EventGroup asEventGroup(final List<UpdateInfo<?>> list) {
final EventGroup events = new EventGroup();
list.stream().map(EventFactory::newBrokerEvent).forEach(events::addElement);
return events;
}
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.dhp.broker.oa.util;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.dhp.broker.model.Event;
public class EventGroup implements Serializable {
/**
*
*/
private static final long serialVersionUID = 765977943803533130L;
private final List<Event> data = new ArrayList<>();
public List<Event> getData() {
return data;
}
public EventGroup addElement(final Event elem) {
data.add(elem);
return this;
}
public EventGroup addGroup(final EventGroup group) {
data.addAll(group.getData());
return this;
}
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.dhp.broker.oa.util;
public class TrustUtils {
public static float rescale(final double score, final double threshold) {
if (score >= BrokerConstants.MAX_TRUST) {
return BrokerConstants.MAX_TRUST;
}
final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST)
/ (BrokerConstants.MAX_TRUST - threshold);
if (val < BrokerConstants.MIN_TRUST) {
return BrokerConstants.MIN_TRUST;
}
if (val > BrokerConstants.MAX_TRUST) {
return BrokerConstants.MAX_TRUST;
}
return (float) val;
}
}

View File

@ -4,10 +4,20 @@ package eu.dnetlib.dhp.broker.oa.util;
import java.util.function.BiConsumer;
import java.util.function.Function;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.broker.objects.Instance;
import eu.dnetlib.broker.objects.OpenAireEventPayload;
import eu.dnetlib.broker.objects.Publication;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.Provenance;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
public final class UpdateInfo<T> {
@ -15,43 +25,61 @@ public final class UpdateInfo<T> {
private final T highlightValue;
private final Result source;
private final OpenaireBrokerResult source;
private final Result target;
private final OpenaireBrokerResult target;
private final BiConsumer<Publication, T> compileHighlight;
private final BiConsumer<OpenaireBrokerResult, T> compileHighlight;
private final Function<T, String> highlightToString;
private final float trust;
public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target,
final BiConsumer<Publication, T> compileHighlight,
final Function<T, String> highlightToString) {
private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class);
public UpdateInfo(final Topic topic, final T highlightValue, final OpenaireBrokerResult source,
final OpenaireBrokerResult target,
final BiConsumer<OpenaireBrokerResult, T> compileHighlight,
final Function<T, String> highlightToString,
final DedupConfig dedupConfig) {
this.topic = topic;
this.highlightValue = highlightValue;
this.source = source;
this.target = target;
this.compileHighlight = compileHighlight;
this.highlightToString = highlightToString;
this.trust = calculateTrust(source, target);
this.trust = calculateTrust(dedupConfig, source, target);
}
public T getHighlightValue() {
return highlightValue;
}
public Result getSource() {
public OpenaireBrokerResult getSource() {
return source;
}
public Result getTarget() {
public OpenaireBrokerResult getTarget() {
return target;
}
private float calculateTrust(final Result source, final Result target) {
// TODO
return 0.9f;
private float calculateTrust(final DedupConfig dedupConfig, final OpenaireBrokerResult r1,
final OpenaireBrokerResult r2) {
try {
final ObjectMapper objectMapper = new ObjectMapper();
final MapDocument doc1 = MapDocumentUtil
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
final MapDocument doc2 = MapDocumentUtil
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
final double threshold = dedupConfig.getWf().getThreshold();
return TrustUtils.rescale(score, threshold);
} catch (final Exception e) {
log.error("Error computing score between results", e);
return BrokerConstants.MIN_TRUST;
}
}
protected Topic getTopic() {
@ -66,12 +94,35 @@ public final class UpdateInfo<T> {
return trust;
}
public void compileHighlight(final OpenAireEventPayload payload) {
compileHighlight.accept(payload.getHighlight(), getHighlightValue());
}
public String getHighlightValueAsString() {
return highlightToString.apply(getHighlightValue());
}
public OpenAireEventPayload asBrokerPayload() {
compileHighlight.accept(target, getHighlightValue());
final OpenaireBrokerResult hl = new OpenaireBrokerResult();
compileHighlight.accept(hl, getHighlightValue());
final String provId = getSource().getOriginalId();
final String provRepo = getSource().getCollectedFromName();
final String provUrl = getSource()
.getInstances()
.stream()
.map(Instance::getUrl)
.findFirst()
.orElse(null);
;
final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl);
return new OpenAireEventPayload()
.setPublication(target)
.setHighlight(hl)
.setTrust(trust)
.setProvenance(provenance);
}
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.simple;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class ResultAggregator extends Aggregator<Tuple2<OpenaireBrokerResult, Relation>, ResultGroup, ResultGroup> {
/**
*
*/
private static final long serialVersionUID = -1492327874705585538L;
@Override
public ResultGroup zero() {
return new ResultGroup();
}
@Override
public ResultGroup reduce(final ResultGroup group, final Tuple2<OpenaireBrokerResult, Relation> t) {
return group.addElement(t._1);
}
@Override
public ResultGroup merge(final ResultGroup g1, final ResultGroup g2) {
return g1.addGroup(g2);
}
@Override
public ResultGroup finish(final ResultGroup group) {
return group;
}
@Override
public Encoder<ResultGroup> bufferEncoder() {
return Encoders.kryo(ResultGroup.class);
}
@Override
public Encoder<ResultGroup> outputEncoder() {
return Encoders.kryo(ResultGroup.class);
}
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.simple;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
public class ResultGroup implements Serializable {
/**
*
*/
private static final long serialVersionUID = -3360828477088669296L;
private final List<OpenaireBrokerResult> data = new ArrayList<>();
public List<OpenaireBrokerResult> getData() {
return data;
}
public ResultGroup addElement(final OpenaireBrokerResult elem) {
data.add(elem);
return this;
}
public ResultGroup addGroup(final ResultGroup group) {
data.addAll(group.getData());
return this;
}
public boolean isValid() {
return data.size() > 1;
}
}

View File

@ -0,0 +1,69 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import scala.Tuple2;
public class OpenaireBrokerResultAggregator<T>
extends Aggregator<Tuple2<OpenaireBrokerResult, T>, OpenaireBrokerResult, OpenaireBrokerResult> {
/**
*
*/
private static final long serialVersionUID = -3687878788861013488L;
@Override
public OpenaireBrokerResult zero() {
return new OpenaireBrokerResult();
}
@Override
public OpenaireBrokerResult finish(final OpenaireBrokerResult g) {
return g;
}
@Override
public OpenaireBrokerResult reduce(final OpenaireBrokerResult g, final Tuple2<OpenaireBrokerResult, T> t) {
if (g.getOriginalId() == null) {
return t._1;
} else if (t._2 instanceof RelatedSoftware) {
g.getSoftwares().add(((RelatedSoftware) t._2).getRelSoftware());
} else if (t._2 instanceof RelatedDataset) {
g.getDatasets().add(((RelatedDataset) t._2).getRelDataset());
} else if (t._2 instanceof RelatedPublication) {
g.getPublications().add(((RelatedPublication) t._2).getRelPublication());
} else if (t._2 instanceof RelatedProject) {
g.getProjects().add(((RelatedProject) t._2).getRelProject());
}
return g;
}
@Override
public OpenaireBrokerResult merge(final OpenaireBrokerResult g1, final OpenaireBrokerResult g2) {
if (g1.getOriginalId() != null) {
g1.getSoftwares().addAll(g2.getSoftwares());
g1.getDatasets().addAll(g2.getDatasets());
g1.getPublications().addAll(g2.getPublications());
g1.getProjects().addAll(g2.getProjects());
return g1;
} else {
return g2;
}
}
@Override
public Encoder<OpenaireBrokerResult> bufferEncoder() {
return Encoders.kryo(OpenaireBrokerResult.class);
}
@Override
public Encoder<OpenaireBrokerResult> outputEncoder() {
return Encoders.kryo(OpenaireBrokerResult.class);
}
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import java.io.Serializable;
import eu.dnetlib.broker.objects.Dataset;
public class RelatedDataset implements Serializable {
/**
*
*/
private static final long serialVersionUID = 774487705184038324L;
private final String source;
private final String relType;
private final Dataset relDataset;
public RelatedDataset(final String source, final String relType, final Dataset relDataset) {
this.source = source;
this.relType = relType;
this.relDataset = relDataset;
}
public String getSource() {
return source;
}
public String getRelType() {
return relType;
}
public Dataset getRelDataset() {
return relDataset;
}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import eu.dnetlib.broker.objects.Dataset;
import eu.dnetlib.broker.objects.Project;
import eu.dnetlib.broker.objects.Publication;
import eu.dnetlib.broker.objects.Software;
public class RelatedEntityFactory {
@SuppressWarnings("unchecked")
public static <RT, T> RT newRelatedEntity(final String sourceId,
final String relType,
final T target,
final Class<RT> clazz) {
if (clazz == RelatedProject.class) {
return (RT) new RelatedProject(sourceId, relType, (Project) target);
}
if (clazz == RelatedSoftware.class) {
return (RT) new RelatedSoftware(sourceId, relType, (Software) target);
}
if (clazz == RelatedDataset.class) {
return (RT) new RelatedDataset(sourceId, relType, (Dataset) target);
}
if (clazz == RelatedPublication.class) {
return (RT) new RelatedPublication(sourceId, relType, (Publication) target);
}
return null;
}
}

View File

@ -0,0 +1,37 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import java.io.Serializable;
import eu.dnetlib.broker.objects.Project;
public class RelatedProject implements Serializable {
/**
*
*/
private static final long serialVersionUID = 4941437626549329870L;
private final String source;
private final String relType;
private final Project relProject;
public RelatedProject(final String source, final String relType, final Project relProject) {
this.source = source;
this.relType = relType;
this.relProject = relProject;
}
public String getSource() {
return source;
}
public String getRelType() {
return relType;
}
public Project getRelProject() {
return relProject;
}
}

View File

@ -0,0 +1,37 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import java.io.Serializable;
import eu.dnetlib.broker.objects.Publication;
public class RelatedPublication implements Serializable {
/**
*
*/
private static final long serialVersionUID = 9021609640411395128L;
private final String source;
private final String relType;
private final Publication relPublication;
public RelatedPublication(final String source, final String relType, final Publication relPublication) {
this.source = source;
this.relType = relType;
this.relPublication = relPublication;
}
public String getSource() {
return source;
}
public String getRelType() {
return relType;
}
public Publication getRelPublication() {
return relPublication;
}
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import java.io.Serializable;
import eu.dnetlib.broker.objects.Software;
public class RelatedSoftware implements Serializable {
/**
*
*/
private static final long serialVersionUID = 7573383356943300157L;
private final String source;
private final String relType;
private final Software relSoftware;
public RelatedSoftware(final String source, final String relType, final Software relSoftware) {
this.source = source;
this.relType = relType;
this.relSoftware = relSoftware;
}
public String getSource() {
return source;
}
public String getRelType() {
return relType;
}
public Software getRelSoftware() {
return relSoftware;
}
}

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,99 @@
<workflow-app name="create broker events" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphInputPath</name>
<description>the path where the graph is stored</description>
</property>
<property>
<name>eventsOutputPath</name>
<description>the path where the the events will be stored</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>dedupConfProfId</name>
<description>the id of a valid Dedup Configuration Profile</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="generate_events"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="generate_events">
<java>
<prepare>
<delete path="${eventsOutputPath}"/>
</prepare>
<main-class>eu.dnetlib.dhp.broker.oa.GenerateEventsApplication</main-class>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--eventsPath</arg><arg>${eventsOutputPath}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,26 @@
[
{
"paramName": "g",
"paramLongName": "graphPath",
"paramDescription": "the path where there the graph is stored",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "eventsPath",
"paramDescription": "the path where the generated events will be stored",
"paramRequired": true
},
{
"paramName": "lu",
"paramLongName": "isLookupUrl",
"paramDescription": "the address of the ISLookUpService",
"paramRequired": true
},
{
"paramName": "d",
"paramLongName": "dedupConfProfile",
"paramDescription": "the id of a valid Dedup Configuration Profile",
"paramRequired": true
}
]

View File

@ -0,0 +1,73 @@
package eu.dnetlib.dhp.broker.oa.util;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;
public class TrustUtilsTest {
private static final double THRESHOLD = 0.95;
@Test
public void rescaleTest_1() {
verifyValue(-0.3, BrokerConstants.MIN_TRUST);
}
@Test
public void rescaleTest_2() {
verifyValue(0.0, BrokerConstants.MIN_TRUST);
}
@Test
public void rescaleTest_3() {
verifyValue(0.5, BrokerConstants.MIN_TRUST);
}
@Test
public void rescaleTest_4() {
verifyValue(0.95, BrokerConstants.MIN_TRUST);
}
@Test
public void rescaleTest_5() {
verifyValue(0.96, BrokerConstants.MIN_TRUST);
}
@Test
public void rescaleTest_6() {
verifyValue(0.97, 0.3f);
}
@Test
public void rescaleTest_7() {
verifyValue(0.98, 0.45f);
}
@Test
public void rescaleTest_8() {
verifyValue(0.99, 0.6f);
}
@Test
public void rescaleTest_9() {
verifyValue(1.00, BrokerConstants.MAX_TRUST);
}
@Test
public void rescaleTest_10() {
verifyValue(1.01, BrokerConstants.MAX_TRUST);
}
@Test
public void rescaleTest_11() {
verifyValue(2.00, BrokerConstants.MAX_TRUST);
}
private void verifyValue(final double originalScore, final float expectedTrust) {
final float trust = TrustUtils.rescale(originalScore, THRESHOLD);
System.out.println(trust);
assertTrue(Math.abs(trust - expectedTrust) < 0.01);
}
}

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-openaire</artifactId>

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version>
<version>1.2.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -0,0 +1,92 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-doiboost</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.4</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.cxf</groupId>
<artifactId>cxf-rt-transports-http</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,366 @@
package eu.dnetlib.doiboost
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import scala.io.Source
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}
object DoiBoostMappingUtil {
def getUnknownCountry(): Qualifier = {
createQualifier("UNKNOWN","UNKNOWN","dnet:countries","dnet:countries")
}
def generateMAGAffiliationId(affId: String): String = {
s"20|microsoft___$SEPARATOR${DHPUtils.md5(affId)}"
}
val logger: Logger = LoggerFactory.getLogger(getClass)
//STATIC STRING
val MAG = "microsoft"
val MAG_NAME = "Microsoft Academic Graph"
val ORCID = "ORCID"
val CROSSREF = "Crossref"
val UNPAYWALL = "UnpayWall"
val GRID_AC = "grid.ac"
val WIKPEDIA = "wikpedia"
val doiBoostNSPREFIX = "doiboost____"
val OPENAIRE_PREFIX = "openaire____"
val SEPARATOR = "::"
val DNET_LANGUAGES = "dnet:languages"
val PID_TYPES = "dnet:pid_types"
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
def toActionSet(item:Oaf) :(String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: Dataset =>
val a: AtomicAction[Dataset] = new AtomicAction[Dataset]
a.setClazz(classOf[Dataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case organization: Organization =>
val a: AtomicAction[Organization] = new AtomicAction[Organization]
a.setClazz(classOf[Organization])
a.setPayload(organization)
(organization.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
def toHostedByItem(input:String): (String, HostedByItemType) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]]
(c.keys.head, c.values.head)
}
def toISSNPair(publication: Publication) : (String, Publication) = {
val issn = if (publication.getJournal == null) null else publication.getJournal.getIssnPrinted
val eissn =if (publication.getJournal == null) null else publication.getJournal.getIssnOnline
val lissn =if (publication.getJournal == null) null else publication.getJournal.getIssnLinking
if (issn!= null && issn.nonEmpty)
(issn, publication)
else if(eissn!= null && eissn.nonEmpty)
(eissn, publication)
else if(lissn!= null && lissn.nonEmpty)
(lissn, publication)
else
(publication.getId, publication)
}
def generateGridAffiliationId(gridId:String) :String = {
s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
}
def fixResult(result: Dataset) :Dataset = {
val instanceType = result.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
if (instanceType.isDefined) {
result.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
}
result.getInstance().asScala.foreach(i => {
i.setHostedby(getUbknownHostedBy())
})
result
}
def getUbknownHostedBy():KeyValue = {
val hb = new KeyValue
hb.setValue("Unknown Repository")
hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
hb
}
def getOpenAccessQualifier():Qualifier = {
createQualifier("OPEN","Open Access","dnet:access_modes", "dnet:access_modes")
}
def getRestrictedQualifier():Qualifier = {
createQualifier("RESTRICTED","Restricted","dnet:access_modes", "dnet:access_modes")
}
def fixPublication(input:((String,Publication), (String,HostedByItemType))): Publication = {
val publication = input._1._2
val item = if (input._2 != null) input._2._2 else null
val instanceType = publication.getInstance().asScala.find(i => i.getInstancetype != null && i.getInstancetype.getClassid.nonEmpty)
if (instanceType.isDefined) {
publication.getInstance().asScala.foreach(i => i.setInstancetype(instanceType.get.getInstancetype))
}
publication.getInstance().asScala.foreach(i => {
val hb = new KeyValue
if (item != null) {
hb.setValue(item.officialname)
hb.setKey(generateDSId(item.id))
if (item.openAccess)
i.setAccessright(getOpenAccessQualifier())
publication.setBestaccessright(getOpenAccessQualifier())
}
else {
hb.setValue("Unknown Repository")
hb.setKey(s"10|$OPENAIRE_PREFIX::55045bd2a65019fd8e6741a755395c8c")
}
i.setHostedby(hb)
})
val ar = publication.getInstance().asScala.filter(i => i.getInstancetype != null && i.getAccessright!= null && i.getAccessright.getClassid!= null).map(f=> f.getAccessright.getClassid)
if (ar.nonEmpty) {
if(ar.contains("OPEN")){
publication.setBestaccessright(getOpenAccessQualifier())
}
else {
publication.setBestaccessright(getRestrictedQualifier())
}
}
publication
}
def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::")
val a = StringUtils.substringAfter(input, "::")
s"10|${b}::${DHPUtils.md5(a)}"
}
def generateDataInfo(): DataInfo = {
generateDataInfo("0.9")
}
def filterPublication(publication: Publication): Boolean = {
//Case empty publication
if (publication == null)
return false
//Case publication with no title
if (publication.getTitle == null || publication.getTitle.size == 0)
return false
val s = publication.getTitle.asScala.count(p => p.getValue != null
&& p.getValue.nonEmpty && !p.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))
if (s == 0)
return false
// fixes #4360 (test publisher)
val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account"))) {
return false;
}
//Publication with no Author
if (publication.getAuthor == null || publication.getAuthor.size() == 0)
return false
//filter invalid author
val authors = publication.getAuthor.asScala.map(s => {
if (s.getFullname.nonEmpty) {
s.getFullname
}
else
s"${
s.getName
} ${
s.getSurname
}"
})
val c = authors.count(isValidAuthorName)
if (c == 0)
return false
// fixes #4368
if (authors.count(s => s.equalsIgnoreCase("Addie Jackson")) > 0 && "Elsevier BV".equalsIgnoreCase(publication.getPublisher.getValue))
return false
true
}
def isValidAuthorName(fullName: String): Boolean = {
if (fullName == null || fullName.isEmpty)
return false
if (invalidName.contains(fullName.toLowerCase.trim))
return false
true
}
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
di
}
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId, schemeId))
sp.setValue(value)
sp
}
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId, schemeId))
sp.setValue(value)
sp.setDataInfo(dataInfo)
sp
}
def createCrossrefCollectedFrom(): KeyValue = {
val cf = new KeyValue
cf.setValue(CROSSREF)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(CROSSREF.toLowerCase))
cf
}
def createUnpayWallCollectedFrom(): KeyValue = {
val cf = new KeyValue
cf.setValue(UNPAYWALL)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(UNPAYWALL.toLowerCase))
cf
}
def createORIDCollectedFrom(): KeyValue = {
val cf = new KeyValue
cf.setValue(ORCID)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ORCID.toLowerCase))
cf
}
def generateIdentifier (oaf: Result, doi: String): String = {
val id = DHPUtils.md5 (doi.toLowerCase)
return s"50|${
doiBoostNSPREFIX
}${
SEPARATOR
}${
id
}"
}
def createMAGCollectedFrom(): KeyValue = {
val cf = new KeyValue
cf.setValue(MAG_NAME)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(MAG))
cf
}
def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
val q = new Qualifier
q.setClassid(clsName)
q.setClassname(clsValue)
q.setSchemeid(schName)
q.setSchemename(schValue)
q
}
def createQualifier(cls: String, sch: String): Qualifier = {
createQualifier(cls, cls, sch, sch)
}
def asField[T](value: T): Field[T] = {
val tmp = new Field[T]
tmp.setValue(value)
tmp
}
}

View File

@ -0,0 +1,80 @@
package eu.dnetlib.doiboost
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkGenerateDOIBoostActionSet {
val logger: Logger = LoggerFactory.getLogger(getClass)
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
implicit val mapEncoderAS: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]
val dbPublicationPath = parser.get("dbPublicationPath")
val dbDatasetPath = parser.get("dbDatasetPath")
val crossRefRelation = parser.get("crossRefRelation")
val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
val dbOrganizationPath = parser.get("dbOrganizationPath")
val workingDirPath = parser.get("targetPath")
spark.read.load(dbDatasetPath).as[OafDataset]
.map(d =>DoiBoostMappingUtil.fixResult(d))
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
spark.read.load(dbPublicationPath).as[Publication]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
spark.read.load(dbOrganizationPath).as[Organization]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
spark.read.load(crossRefRelation).as[Relation]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
spark.read.load(dbaffiliationRelationPath).as[Relation]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
}
}

View File

@ -0,0 +1,140 @@
package eu.dnetlib.doiboost
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset, Organization}
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkGenerateDoiBoost {
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
import spark.implicits._
val hostedByMapPath = parser.get("hostedByMapPath")
val workingDirPath = parser.get("workingDirPath")
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
logger.info("Phase 2) Join Crossref with UnpayWall")
val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
def applyMerge(item:((String, Publication), (String, Publication))) : Publication =
{
val crossrefPub = item._1._2
if (item._2!= null) {
val otherPub = item._2._2
if (otherPub != null) {
crossrefPub.mergeFrom(otherPub)
}
}
crossrefPub
}
crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin")
logger.info("Phase 3) Join Result with ORCID")
val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
logger.info("Phase 3) Join Result with MAG")
val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication")
val doiBoostPublication: Dataset[(String,Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p=>DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
val hostedByDataset : Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem))
doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
.map(DoiBoostMappingUtil.fixPublication)
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")
val affiliationPath = parser.get("affiliationPath")
val paperAffiliationPath = parser.get("paperAffiliationPath")
val affiliation = spark.read.load(affiliationPath).select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))
val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))
val a:Dataset[DoiBoostAffiliation] = paperAffiliation
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
.select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation]
val magPubs:Dataset[(String,Publication)]= spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s =>s._1!= null )
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
val pub:Publication = item._1._2
val affiliation = item._2
val affId:String = if (affiliation.GridId.isDefined) DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId.get) else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
val r:Relation = new Relation
r.setSource(pub.getId)
r.setTarget(affId)
r.setRelType("resultOrganization")
r.setRelClass("hasAuthorInstitution")
r.setSubRelType("affiliation")
r.setDataInfo(pub.getDataInfo)
r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
val r1:Relation = new Relation
r1.setTarget(pub.getId)
r1.setSource(affId)
r1.setRelType("resultOrganization")
r1.setRelClass("isAuthorInstitutionOf")
r1.setSubRelType("affiliation")
r1.setDataInfo(pub.getDataInfo)
r1.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
List(r, r1)
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).map( item => {
val affiliation = item._2
if (affiliation.GridId.isEmpty) {
val o = new Organization
o.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
o.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
o.setId(DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString))
o.setOriginalId(List(affiliation.AffiliationId.toString).asJava)
if (affiliation.DisplayName.nonEmpty)
o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
if (affiliation.OfficialPage.isDefined)
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
o.setCountry(DoiBoostMappingUtil.getUnknownCountry())
o
}
else
null
}).filter(o=> o!=null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization")
}
}

View File

@ -0,0 +1,445 @@
package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.apache.commons.lang.StringUtils
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST._
import org.json4s.jackson.JsonMethods._
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex
case class mappingAffiliation(name: String) {}
case class mappingAuthor(given: Option[String], family: String, ORCID: Option[String], affiliation: Option[mappingAffiliation]) {}
case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {}
case object Crossref2Oaf {
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
val mappingCrossrefType = Map(
"book-section" -> "publication",
"book" -> "publication",
"book-chapter" -> "publication",
"book-part" -> "publication",
"book-series" -> "publication",
"book-set" -> "publication",
"book-track" -> "publication",
"edited-book" -> "publication",
"reference-book" -> "publication",
"monograph" -> "publication",
"journal-article" -> "publication",
"dissertation" -> "publication",
"other" -> "publication",
"peer-review" -> "publication",
"proceedings" -> "publication",
"proceedings-article" -> "publication",
"reference-entry" -> "publication",
"report" -> "publication",
"report-series" -> "publication",
"standard" -> "publication",
"standard-series" -> "publication",
"posted-content" -> "publication",
"dataset" -> "dataset"
)
val mappingCrossrefSubType = Map(
"book-section" -> "0013 Part of book or chapter of book",
"book" -> "0002 Book",
"book-chapter" -> "0013 Part of book or chapter of book",
"book-part" -> "0013 Part of book or chapter of book",
"book-series" -> "0002 Book",
"book-set" -> "0002 Book",
"book-track" -> "0002 Book",
"edited-book" -> "0002 Book",
"reference-book" -> "0002 Book",
"monograph" -> "0002 Book",
"journal-article" -> "0001 Article",
"dissertation" -> "0006 Doctoral thesis",
"other" -> "0038 Other literature type",
"peer-review" -> "0015 Review",
"proceedings" -> "0004 Conference object",
"proceedings-article" -> "0004 Conference object",
"reference-entry" -> "0013 Part of book or chapter of book",
"report" -> "0017 Report",
"report-series" -> "0017 Report",
"standard" -> "0038 Other literature type",
"standard-series" -> "0038 Other literature type",
"dataset" -> "0021 Dataset",
"preprint" -> "0016 Preprint",
"report" -> "0017 Report"
)
def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
//MAPPING Crossref DOI into PID
val doi: String = (json \ "DOI").extract[String]
result.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
//MAPPING Crossref DOI into OriginalId
//and Other Original Identifier of dataset like clinical-trial-number
val clinicalTrialNumbers = for (JString(ctr) <- json \ "clinical-trial-number") yield ctr
val alternativeIds = for (JString(ids) <- json \ "alternative-id") yield ids
val tmp = clinicalTrialNumbers ::: alternativeIds ::: List(doi)
result.setOriginalId(tmp.filter(id => id != null).asJava)
//Set identifier as {50|60} | doiboost____::md5(DOI)
result.setId(generateIdentifier(result, doi))
// Add DataInfo
result.setDataInfo(generateDataInfo())
result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long])
result.setDateofcollection((json \ "indexed" \ "date-time").extract[String])
result.setCollectedfrom(List(createCrossrefCollectedFrom()).asJava)
// Publisher ( Name of work's publisher mapped into Result/Publisher)
val publisher = (json \ "publisher").extractOrElse[String](null)
if (publisher!= null && publisher.nonEmpty)
result.setPublisher(asField(publisher))
// TITLE
val mainTitles = for {JString(title) <- json \ "title" if title.nonEmpty} yield createSP(title, "main title", "dnet:dataCite_title")
val originalTitles = for {JString(title) <- json \ "original-title" if title.nonEmpty} yield createSP(title, "alternative title", "dnet:dataCite_title")
val shortTitles = for {JString(title) <- json \ "short-title" if title.nonEmpty} yield createSP(title, "alternative title", "dnet:dataCite_title")
val subtitles = for {JString(title) <- json \ "subtitle" if title.nonEmpty} yield createSP(title, "subtitle", "dnet:dataCite_title")
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
// DESCRIPTION
val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description)
result.setDescription(descriptionList.asJava)
// Source
val sourceList = for {JString(source) <- json \ "source" if source!= null && source.nonEmpty} yield asField(source)
result.setSource(sourceList.asJava)
//RELEVANT DATE Mapping
val createdDate = generateDate((json \ "created" \ "date-time").extract[String], (json \ "created" \ "date-parts").extract[List[List[Int]]], "created", "dnet:dataCite_date")
val postedDate = generateDate((json \ "posted" \ "date-time").extractOrElse[String](null), (json \ "posted" \ "date-parts").extract[List[List[Int]]], "available", "dnet:dataCite_date")
val acceptedDate = generateDate((json \ "accepted" \ "date-time").extractOrElse[String](null), (json \ "accepted" \ "date-parts").extract[List[List[Int]]], "accepted", "dnet:dataCite_date")
val publishedPrintDate = generateDate((json \ "published-print" \ "date-time").extractOrElse[String](null), (json \ "published-print" \ "date-parts").extract[List[List[Int]]], "published-print", "dnet:dataCite_date")
val publishedOnlineDate = generateDate((json \ "published-online" \ "date-time").extractOrElse[String](null), (json \ "published-online" \ "date-parts").extract[List[List[Int]]], "published-online", "dnet:dataCite_date")
val issuedDate = extractDate((json \ "issued" \ "date-time").extractOrElse[String](null), (json \ "issued" \ "date-parts").extract[List[List[Int]]])
if (StringUtils.isNotBlank(issuedDate)) {
result.setDateofacceptance(asField(issuedDate))
}
else {
result.setDateofacceptance(asField(createdDate.getValue))
}
result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava)
//Mapping Subject
val subjectList:List[String] = (json \ "subject").extractOrElse[List[String]](List())
if (subjectList.nonEmpty) {
result.setSubject(subjectList.map(s=> createSP(s, "keywords", "dnet:subject_classification_typologies")).asJava)
}
//Mapping Author
val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List())
result.setAuthor(authorList.map(a => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull)).asJava)
// Mapping instance
val instance = new Instance()
val license = for {
JString(lic) <- json \ "license" \ "URL"
} yield asField(lic)
val l = license.filter(d => StringUtils.isNotBlank(d.getValue))
if (l.nonEmpty)
instance.setLicense(l.head)
val has_review = (json \ "relation" \"has-review" \ "id")
if(has_review != JNothing) {
instance.setRefereed(
createQualifier("0001", "peerReviewed", "dnet:review_levels", "dnet:review_levels"))
}
instance.setAccessright(getRestrictedQualifier())
result.setInstance(List(instance).asJava)
instance.setInstancetype(createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), "dnet:publication_resource", "dnet:publication_resource"))
result.setResourcetype(createQualifier(cobjCategory.substring(0, 4),"dnet:dataCite_resource"))
instance.setCollectedfrom(createCrossrefCollectedFrom())
if (StringUtils.isNotBlank(issuedDate)) {
instance.setDateofacceptance(asField(issuedDate))
}
else {
instance.setDateofacceptance(asField(createdDate.getValue))
}
val s: String = (json \ "URL").extract[String]
val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null).distinct
if (links.nonEmpty)
instance.setUrl(links.asJava)
result
}
def generateAuhtor(given: String, family: String, orcid: String): Author = {
val a = new Author
a.setName(given)
a.setSurname(family)
a.setFullname(s"$given $family")
if (StringUtils.isNotBlank(orcid))
a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
a
}
def convert(input: String): List[Oaf] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
var resultList: List[Oaf] = List()
val objectType = (json \ "type").extractOrElse[String](null)
val objectSubType = (json \ "subtype").extractOrElse[String](null)
if (objectType == null)
return resultList
val result = generateItemFromType(objectType, objectSubType)
if (result == null)
return List()
val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type"));
mappingResult(result, json, cOBJCategory)
val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List())
if (funderList.nonEmpty) {
resultList = resultList ::: mappingFunderToRelations(funderList, result.getId, createCrossrefCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp)
}
result match {
case publication: Publication => convertPublication(publication, json, cOBJCategory)
case dataset: Dataset => convertDataset(dataset)
}
resultList = resultList ::: List(result)
resultList
}
def mappingFunderToRelations(funders: List[mappingFunder], sourceId: String, cf: KeyValue, di: DataInfo, ts: Long): List[Relation] = {
val queue = new mutable.Queue[Relation]
def snsfRule(award:String): String = {
var tmp1 = StringUtils.substringAfter(award,"_")
val tmp2 = StringUtils.substringBefore(tmp1,"/")
logger.debug(s"From $award to $tmp2")
tmp2
}
def extractECAward(award: String): String = {
val awardECRegex: Regex = "[0-9]{4,9}".r
if (awardECRegex.findAllIn(award).hasNext)
return awardECRegex.findAllIn(award).max
null
}
def generateRelation(sourceId:String, targetId:String, nsPrefix:String) :Relation = {
val r = new Relation
r.setSource(sourceId)
r.setTarget(s"$nsPrefix::$targetId")
r.setRelType("resultProject")
r.setRelClass("isProducedBy")
r.setSubRelType("outcome")
r.setCollectedfrom(List(cf).asJava)
r.setDataInfo(di)
r.setLastupdatetimestamp(ts)
r
}
def generateSimpleRelationFromAward(funder: mappingFunder, nsPrefix: String, extractField: String => String): Unit = {
if (funder.award.isDefined && funder.award.get.nonEmpty)
funder.award.get.map(extractField).filter(a => a!= null && a.nonEmpty).foreach(
award => {
val targetId = DHPUtils.md5(award)
queue += generateRelation(sourceId, targetId, nsPrefix)
}
)
}
if (funders != null)
funders.foreach(funder => {
if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
funder.DOI.get match {
case "10.13039/100010663" |
"10.13039/100010661" |
"10.13039/501100007601" |
"10.13039/501100000780" |
"10.13039/100010665" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "10.13039/100011199" |
"10.13039/100004431" |
"10.13039/501100004963" |
"10.13039/501100000780" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
case "10.13039/501100000781" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a)
case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a)
case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a)
case "10.13039/501100000038"=> queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "nserc_______" )
case "10.13039/501100000155"=> queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "sshrc_______" )
case "10.13039/501100000024"=> queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "cihr________" )
case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a)
case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a=>a)
case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
case "10.13039/501100003407" => generateSimpleRelationFromAward(funder, "miur________", a=>a)
queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "miur________" )
case "10.13039/501100006588" |
"10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") )
case "10.13039/501100006769"=> generateSimpleRelationFromAward(funder, "rsf_________", a=>a)
case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a)
case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a)
case "10.13039/100004440"=> queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "wt__________" )
case _ => logger.debug("no match for "+funder.DOI.get )
}
} else {
funder.name match {
case "European Unions Horizon 2020 research and innovation program" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
case "European Union's" =>
generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
case "The French National Research Agency (ANR)" |
"The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
case "Wellcome Trust Masters Fellowship" => queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "wt__________" )
case _ => logger.debug("no match for "+funder.name )
}
}
}
)
queue.toList
}
def convertDataset(dataset: Dataset): Unit = {
// TODO check if there are other info to map into the Dataset
}
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
val containerTitles = for {JString(ct) <- json \ "container-title"} yield ct
//Mapping book
if (cobjCategory.toLowerCase.contains("book")) {
val ISBN = for {JString(isbn) <- json \ "ISBN"} yield isbn
if (ISBN.nonEmpty && containerTitles.nonEmpty) {
val source = s"${containerTitles.head} ISBN: ${ISBN.head}"
if (publication.getSource != null) {
val l: List[Field[String]] = publication.getSource.asScala.toList
val ll: List[Field[String]] = l ::: List(asField(source))
publication.setSource(ll.asJava)
}
else
publication.setSource(List(asField(source)).asJava)
}
} else {
// Mapping Journal
val issnInfos = for {JArray(issn_types) <- json \ "issn-type"
JObject(issn_type) <- issn_types
JField("type", JString(tp)) <- issn_type
JField("value", JString(vl)) <- issn_type
} yield Tuple2(tp, vl)
val volume = (json \ "volume").extractOrElse[String](null)
if (containerTitles.nonEmpty) {
val journal = new Journal
journal.setName(containerTitles.head)
if (issnInfos.nonEmpty) {
issnInfos.foreach(tp => {
tp._1 match {
case "electronic" => journal.setIssnOnline(tp._2)
case "print" => journal.setIssnPrinted(tp._2)
}
})
}
journal.setVol(volume)
val page = (json \ "page").extractOrElse[String](null)
if (page != null) {
val pp = page.split("-")
if (pp.nonEmpty)
journal.setSp(pp.head)
if (pp.size > 1)
journal.setEp(pp(1))
}
publication.setJournal(journal)
}
}
}
def extractDate(dt: String, datePart: List[List[Int]]): String = {
if (StringUtils.isNotBlank(dt))
return dt
if (datePart != null && datePart.size == 1) {
val res = datePart.head
if (res.size == 3) {
val dp = f"${res.head}-${res(1)}%02d-${res(2)}%02d"
if (dp.length == 10) {
return dp
}
}
}
null
}
def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = {
val dp = extractDate(dt, datePart)
if (StringUtils.isNotBlank(dp))
return createSP(dp, classId, schemeId)
null
}
def generateItemFromType(objectType: String, objectSubType: String): Result = {
if (mappingCrossrefType.contains(objectType)) {
if (mappingCrossrefType(objectType).equalsIgnoreCase("publication"))
return new Publication()
if (mappingCrossrefType(objectType).equalsIgnoreCase("dataset"))
return new Dataset()
}
null
}
}

View File

@ -0,0 +1,103 @@
package eu.dnetlib.doiboost.crossref;
import java.io.ByteArrayOutputStream;
import java.util.zip.Inflater;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class CrossrefImporter {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
CrossrefImporter.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/import_from_es.json")));
parser.parseArgument(args);
final String hdfsuri = parser.get("namenode");
System.out.println("HDFS URI" + hdfsuri);
Path hdfswritepath = new Path(parser.get("targetPath"));
System.out.println("TargetPath: " + hdfsuri);
final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp"))
? Long.parseLong(parser.get("timestamp"))
: -1;
if (timestamp > 0)
System.out.println("Timestamp added " + timestamp);
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsuri);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
ESClient client = timestamp > 0
? new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp)
: new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref");
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(hdfswritepath),
SequenceFile.Writer.keyClass(IntWritable.class),
SequenceFile.Writer.valueClass(Text.class))) {
int i = 0;
long start = System.currentTimeMillis();
long end = 0;
final IntWritable key = new IntWritable(i);
final Text value = new Text();
while (client.hasNext()) {
key.set(i++);
value.set(client.next());
writer.append(key, value);
if (i % 100000 == 0) {
end = System.currentTimeMillis();
final float time = (end - start) / 1000.0F;
System.out
.println(
String.format("Imported %d records last 100000 imported in %f seconds", i, time));
start = System.currentTimeMillis();
}
}
}
}
public static String decompressBlob(final String blob) {
try {
byte[] byteArray = Base64.decodeBase64(blob.getBytes());
final Inflater decompresser = new Inflater();
decompresser.setInput(byteArray);
final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
byte[] buffer = new byte[8192];
while (!decompresser.finished()) {
int size = decompresser.inflate(buffer);
bos.write(buffer, 0, size);
}
byte[] unzippeddata = bos.toByteArray();
decompresser.end();
return new String(unzippeddata);
} catch (Throwable e) {
throw new RuntimeException("Wrong record:" + blob, e);
}
}
}

View File

@ -0,0 +1,114 @@
package eu.dnetlib.doiboost.crossref;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.jayway.jsonpath.JsonPath;
public class ESClient implements Iterator<String> {
private static final Logger logger = LoggerFactory.getLogger(ESClient.class);
static final String blobPath = "$.hits[*].hits[*]._source.blob";
static final String scrollIdPath = "$._scroll_id";
static final String JSON_NO_TS = "{\"size\":1000}";
static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
private final String scrollId;
private List<String> buffer;
private final String esHost;
public ESClient(final String esHost, final String esIndex) throws IOException {
this.esHost = esHost;
final String body = getResponse(
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS);
scrollId = getJPathString(scrollIdPath, body);
buffer = getBlobs(body);
}
public ESClient(final String esHost, final String esIndex, final long timestamp)
throws IOException {
this.esHost = esHost;
final String body = getResponse(
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex),
String.format(JSON_WITH_TS, timestamp));
scrollId = getJPathString(scrollIdPath, body);
buffer = getBlobs(body);
}
private String getResponse(final String url, final String json) {
CloseableHttpClient client = HttpClients.createDefault();
try {
HttpPost httpPost = new HttpPost(url);
if (json != null) {
StringEntity entity = new StringEntity(json);
httpPost.setEntity(entity);
httpPost.setHeader("Accept", "application/json");
httpPost.setHeader("Content-type", "application/json");
}
CloseableHttpResponse response = client.execute(httpPost);
return IOUtils.toString(response.getEntity().getContent());
} catch (Throwable e) {
throw new RuntimeException("Error on executing request ", e);
} finally {
try {
client.close();
} catch (IOException e) {
throw new RuntimeException("Unable to close client ", e);
}
}
}
private String getJPathString(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof String)
return (String) o;
return null;
} catch (Exception e) {
return "";
}
}
private List<String> getBlobs(final String body) {
final List<String> res = JsonPath.read(body, "$.hits.hits[*]._source.blob");
return res;
}
@Override
public boolean hasNext() {
return (buffer != null && !buffer.isEmpty());
}
@Override
public String next() {
final String nextItem = buffer.remove(0);
if (buffer.isEmpty()) {
final String json_param = String.format(JSON_SCROLL, scrollId);
final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
try {
buffer = getBlobs(body);
} catch (Throwable e) {
logger.error("Error on get next page: body:" + body);
}
}
return nextItem;
}
}

View File

@ -0,0 +1,98 @@
package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
case class Reference(author: String, firstPage: String) {}
object SparkMapDumpIntoOAF {
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation]
implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]
val sc = spark.sparkContext
val targetPath = parser.get("targetPath")
sc.sequenceFile(parser.get("sourcePath"), classOf[IntWritable], classOf[Text])
.map(k => k._2.toString).map(CrossrefImporter.decompressBlob)
.flatMap(k => Crossref2Oaf.convert(k)).saveAsObjectFile(s"${targetPath}/mixObject")
val inputRDD = sc.objectFile[Oaf](s"${targetPath}/mixObject").filter(p=> p!= null)
val distinctPubs:RDD[Publication] = inputRDD.filter(k => k != null && k.isInstanceOf[Publication])
.map(k => k.asInstanceOf[Publication]).map { p: Publication => Tuple2(p.getId, p) }.reduceByKey { case (p1: Publication, p2: Publication) =>
var r = if (p1 == null) p2 else p1
if (p1 != null && p2 != null) {
if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
r = p2
else
r = p1
} else {
r = if (p1.getLastupdatetimestamp == null) p2 else p1
}
}
r
}.map(_._2)
val pubs:Dataset[Publication] = spark.createDataset(distinctPubs)
pubs.write.mode(SaveMode.Overwrite).save(s"${targetPath}/publication")
val distincDatasets:RDD[OafDataset] = inputRDD.filter(k => k != null && k.isInstanceOf[OafDataset])
.map(k => k.asInstanceOf[OafDataset]).map(p => Tuple2(p.getId, p)).reduceByKey { case (p1: OafDataset, p2: OafDataset) =>
var r = if (p1 == null) p2 else p1
if (p1 != null && p2 != null) {
if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
r = p2
else
r = p1
} else {
r = if (p1.getLastupdatetimestamp == null) p2 else p1
}
}
r
}.map(_._2)
spark.createDataset(distincDatasets).write.mode(SaveMode.Overwrite).save(s"${targetPath}/dataset")
val distinctRels =inputRDD.filter(k => k != null && k.isInstanceOf[Relation])
.map(k => k.asInstanceOf[Relation]).map(r=> (s"${r.getSource}::${r.getTarget}",r))
.reduceByKey { case (p1: Relation, p2: Relation) =>
if (p1 == null) p2 else p1
}.map(_._2)
val rels: Dataset[Relation] = spark.createDataset(distinctRels)
rels.write.mode(SaveMode.Overwrite).save(s"${targetPath}/relations")
}
}

View File

@ -0,0 +1,316 @@
package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex
case class MagPapers(PaperId: Long, Rank: Integer, Doi: String,
DocType: String, PaperTitle: String, OriginalTitle: String,
BookTitle: String, Year: Option[Integer], Date: Option[java.sql.Timestamp], Publisher: String,
JournalId: Option[Long], ConferenceSeriesId: Option[Long], ConferenceInstanceId: Option[Long],
Volume: String, Issue: String, FirstPage: String, LastPage: String,
ReferenceCount: Option[Long], CitationCount: Option[Long], EstimatedCitation: Option[Long],
OriginalVenue: String, FamilyId: Option[Long], CreatedDate: java.sql.Timestamp) {}
case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String) {}
case class MagAuthor(AuthorId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], LastKnownAffiliationId: Option[Long], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
case class MagAffiliation(AffiliationId: Long, Rank: Int, NormalizedName: String, DisplayName: String, GridId: String, OfficialPage: String, WikiPage: String, PaperCount: Long, CitationCount: Long, Latitude: Option[Float], Longitude: Option[Float], CreatedDate: java.sql.Timestamp) {}
case class MagPaperAuthorAffiliation(PaperId: Long, AuthorId: Long, AffiliationId: Option[Long], AuthorSequenceNumber: Int, OriginalAuthor: String, OriginalAffiliation: String) {}
case class MagAuthorAffiliation(author: MagAuthor, affiliation:String)
case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {}
case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation:String) {}
case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option[String], LanguageCode: Option[String]) {}
case class MagUrlInstance(SourceUrl:String){}
case class MagUrl(PaperId: Long, instances: List[MagUrlInstance])
case class MagSubject(FieldOfStudyId:Long, DisplayName:String, MainType:Option[String], Score:Float){}
case class MagFieldOfStudy(PaperId:Long, subjects:List[MagSubject]) {}
case class MagJournal(JournalId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], Issn: Option[String], Publisher: Option[String], Webpage: Option[String], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp]) {}
case class MagConferenceInstance(ci:Long, DisplayName:Option[String], Location:Option[String], StartDate:Option[java.sql.Timestamp], EndDate:Option[java.sql.Timestamp], PaperId:Long){}
case object ConversionUtil {
def extractMagIdentifier(pids:mutable.Buffer[String]) :String ={
val magIDRegex: Regex = "^[0-9]+$".r
val s =pids.filter(p=> magIDRegex.findAllIn(p).hasNext)
if (s.nonEmpty)
return s.head
null
}
def mergePublication(a: Publication, b:Publication) : Publication = {
if ((a != null) && (b != null)) {
a.mergeFrom(b)
a
} else {
if (a == null) b else a
}
}
def choiceLatestMagArtitcle(p1: MagPapers, p2:MagPapers) :MagPapers = {
var r = if (p1 == null) p2 else p1
if (p1 != null && p2 != null) {
if (p1.CreatedDate != null && p2.CreatedDate != null) {
if (p1.CreatedDate.before(p2.CreatedDate))
r = p2
else
r = p1
} else {
r = if (p1.CreatedDate == null) p2 else p1
}
}
r
}
def updatePubsWithDescription(inputItem:((String, Publication), MagPaperAbstract)) : Publication = {
val pub = inputItem._1._2
val abst = inputItem._2
if (abst != null) {
pub.setDescription(List(asField(abst.IndexedAbstract)).asJava)
}
pub
}
def updatePubsWithConferenceInfo(inputItem:((String, Publication), MagConferenceInstance)) : Publication = {
val publication:Publication= inputItem._1._2
val ci:MagConferenceInstance = inputItem._2
if (ci!= null){
val j:Journal = new Journal
if (ci.Location.isDefined)
j.setConferenceplace(ci.Location.get)
j.setName(ci.DisplayName.get)
if (ci.StartDate.isDefined && ci.EndDate.isDefined)
{
j.setConferencedate(s"${ci.StartDate.get.toString.substring(0,10)} - ${ci.EndDate.get.toString.substring(0,10)}")
}
publication.setJournal(j)
}
publication
}
def updatePubsWithSubject(item:((String, Publication), MagFieldOfStudy)) : Publication = {
val publication = item._1._2
val fieldOfStudy = item._2
if (fieldOfStudy != null && fieldOfStudy.subjects != null && fieldOfStudy.subjects.nonEmpty) {
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
val s1 = createSP(s.DisplayName, "keyword", "dnet:subject_classification_typologies")
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
var resList: List[StructuredProperty] = List(s1)
if (s.MainType.isDefined) {
val maintp = s.MainType.get
val s2 = createSP(s.MainType.get, "keyword", "dnet:subject_classification_typologies")
s2.setDataInfo(di)
resList = resList ::: List(s2)
if (maintp.contains(".")) {
val s3 = createSP(maintp.split("\\.").head, "keyword", "dnet:subject_classification_typologies")
s3.setDataInfo(di)
resList = resList ::: List(s3)
}
}
resList
})
publication.setSubject(p.asJava)
}
publication
}
def addInstances(a: (Publication, MagUrl)): Publication = {
val pub = a._1
val urls = a._2
val i = new Instance
if (urls!= null) {
val l:List[String] = urls.instances.filter(k=>k.SourceUrl.nonEmpty).map(k=>k.SourceUrl):::List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}")
i.setUrl(l.asJava)
}
else
i.setUrl(List(s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}").asJava)
i.setCollectedfrom(createMAGCollectedFrom())
pub.setInstance(List(i).asJava)
pub
}
def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = {
MagPaperAbstract(input.PaperId, convertInvertedIndexString(input.IndexedAbstract))
}
def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)): Publication = {
val paper = inputParams._1._1
val journal = inputParams._1._2
val authors = inputParams._2
val pub = new Publication
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
//Set identifier as 50|doiboost____::md5(DOI)
pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase))
val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title")
val originalTitles = createSP(paper.OriginalTitle, "alternative title", "dnet:dataCite_title")
pub.setTitle(List(mainTitles, originalTitles).asJava)
pub.setSource(List(asField(paper.BookTitle)).asJava)
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
a.setFullname(f.author.DisplayName.get)
if(f.affiliation!= null)
a.setAffiliation(List(asField(f.affiliation)).asJava)
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", PID_TYPES)).asJava)
a
}
pub.setAuthor(authorsOAF.asJava)
if (paper.Date != null && paper.Date.isDefined) {
pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0,10)))
}
pub.setPublisher(asField(paper.Publisher))
if (journal != null && journal.DisplayName.isDefined) {
val j = new Journal
j.setName(journal.DisplayName.get)
j.setSp(paper.FirstPage)
j.setEp(paper.LastPage)
if (journal.Publisher.isDefined)
pub.setPublisher(asField(journal.Publisher.get))
if (journal.Issn.isDefined)
j.setIssnPrinted(journal.Issn.get)
j.setVol(paper.Volume)
j.setIss(paper.Issue)
pub.setJournal(j)
}
pub.setCollectedfrom(List(createMAGCollectedFrom()).asJava)
pub.setDataInfo(generateDataInfo())
pub
}
def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = {
val paper = inputParams._1._1
val authors = inputParams._1._2
val description = inputParams._2
val pub = new Publication
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
//Set identifier as 50 | doiboost____::md5(DOI)
pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase))
val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title")
val originalTitles = createSP(paper.OriginalTitle, "alternative title", "dnet:dataCite_title")
pub.setTitle(List(mainTitles, originalTitles).asJava)
pub.setSource(List(asField(paper.BookTitle)).asJava)
if (description != null) {
pub.setDescription(List(asField(description.IndexedAbstract)).asJava)
}
val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author
a.setFullname(f.author.DisplayName.get)
if(f.affiliation!= null)
a.setAffiliation(List(asField(f.affiliation)).asJava)
a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", PID_TYPES)).asJava)
a
}
if (paper.Date != null) {
pub.setDateofacceptance(asField(paper.Date.toString.substring(0,10)))
}
pub.setAuthor(authorsOAF.asJava)
pub
}
def convertInvertedIndexString(json_input: String): String = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(json_input)
val idl = (json \ "IndexLength").extract[Int]
if (idl > 0) {
val res = Array.ofDim[String](idl)
val iid = (json \ "InvertedIndex").extract[Map[String, List[Int]]]
for {(k: String, v: List[Int]) <- iid} {
v.foreach(item => res(item) = k)
}
(0 until idl).foreach(i => {
if (res(i) == null)
res(i) = ""
})
return res.mkString(" ")
}
""
}
}

View File

@ -0,0 +1,92 @@
package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.types._
import org.slf4j.{Logger, LoggerFactory}
import org.apache.spark.sql.functions._
object SparkImportMagIntoDataset {
val datatypedict = Map(
"int" -> IntegerType,
"uint" -> IntegerType,
"long" -> LongType,
"ulong" -> LongType,
"float" -> FloatType,
"string" -> StringType,
"DateTime" -> DateType
)
val stream = Map(
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
"PaperExtendedAttributes" -> Tuple2("mag/PaperExtendedAttributes.txt", Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")),
"PaperFieldsOfStudy" -> Tuple2("advanced/PaperFieldsOfStudy.txt", Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")),
"PaperRecommendations" -> Tuple2("advanced/PaperRecommendations.txt", Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")),
"PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
"PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
"PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "CreatedDate:DateTime")),
"RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
)
def getSchema(streamName: String): StructType = {
var schema = new StructType()
val d: Seq[String] = stream(streamName)._2
d.foreach { case t =>
val currentType = t.split(":")
val fieldName: String = currentType.head
var fieldType: String = currentType.last
val nullable: Boolean = fieldType.endsWith("?")
if (nullable)
fieldType = fieldType.replace("?", "")
schema = schema.add(StructField(fieldName, datatypedict(fieldType), nullable))
}
schema
}
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
stream.foreach { case (k, v) =>
val s: StructType = getSchema(k)
val df = spark.read
.option("header", "false")
.option("charset", "UTF8")
.option("delimiter", "\t")
.schema(s)
.csv(s"${parser.get("sourcePath")}/${v._1}")
logger.info(s"Converting $k")
df.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/$k")
}
}
}

View File

@ -0,0 +1,157 @@
package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Publication
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkPreProcessMAG {
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
logger.info("Phase 1) make uninque DOI in Papers:")
val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
.rdd
.map{ p: MagPapers => Tuple2(p.Doi, p) }
.reduceByKey((p1:MagPapers,p2:MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1,p2))
.map(_._2)
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
logger.info("Phase 6) Enrich Publication with description")
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
logger.info("Phase 3) Group Author by PaperId")
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
val paperAuthorAffiliation = spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]
paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
.map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null)) }
.joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
.map(s => {
val mpa = s._1._2
val af = s._2
if (af != null) {
MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName)
} else
mpa
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
val papers = spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
var magPubs: Dataset[(String, Publication)] =
spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
val conference = spark.read.load(s"$sourcePath/ConferenceInstances")
.select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate",$"EndDate" )
val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
.select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate",$"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance]
magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
.write
.mode(SaveMode.Overwrite)
.save(s"${parser.get("targetPath")}/merge_step_2_conference")
magPubs= spark.read.load(s"${parser.get("targetPath")}/merge_step_2_conference").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
logger.info("Phase 5) enrich publication with URL and Instances")
magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
.map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
.write.mode(SaveMode.Overwrite)
.save(s"${parser.get("targetPath")}/merge_step_3")
// logger.info("Phase 6) Enrich Publication with description")
// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
// pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithDescription(item)
).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
logger.info("Phase 7) Enrich Publication with FieldOfStudy")
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_4").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")
val paperField = pfos.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
.select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
.groupBy($"PaperId").agg(collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score")).as("subjects"))
.as[MagFieldOfStudy]
magPubs.joinWith(paperField, col("_1")
.equalTo(paperField("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithSubject(item))
.write.mode(SaveMode.Overwrite)
.save(s"${parser.get("targetPath")}/mag_publication")
val s:RDD[Publication] = spark.read.load(s"${parser.get("targetPath")}/mag_publication").as[Publication]
.map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
.map(_._2)
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/mag_publication_u")
}
}

View File

@ -0,0 +1,146 @@
package eu.dnetlib.doiboost.orcid;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log;
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
import eu.dnetlib.doiboost.orcid.model.WorkData;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
public class ActivitiesDecompressor {
private static final int MAX_XML_WORKS_PARSED = -1;
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
throws Exception {
String uri = inputUri;
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path inputPath = new Path(uri);
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(inputPath);
if (codec == null) {
System.err.println("No codec found for " + uri);
System.exit(1);
}
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
InputStream gzipInputStream = null;
try {
gzipInputStream = codec.createInputStream(fs.open(inputPath));
parseTarActivities(fs, conf, gzipInputStream, outputPath);
} finally {
Log.debug("Closing gzip stream");
IOUtils.closeStream(gzipInputStream);
}
}
private static void parseTarActivities(
FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
int counter = 0;
int doiFound = 0;
int errorFromOrcidFound = 0;
int xmlParserErrorFound = 0;
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
TarArchiveEntry entry = null;
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(outputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class))) {
while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName();
try {
if (entry.isDirectory() || !filename.contains("works")) {
} else {
Log.debug("XML work entry name: " + entry.getName());
counter++;
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
// tarInput
String line;
StringBuffer buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line);
}
WorkData workData = XMLRecordParser.VTDParseWorkData(buffer.toString().getBytes());
if (workData != null) {
if (workData.getErrorCode() != null) {
errorFromOrcidFound += 1;
Log
.debug(
"error from Orcid with code "
+ workData.getErrorCode()
+ " for entry "
+ entry.getName());
continue;
}
if (workData.isDoiFound()) {
String jsonData = JsonWriter.create(workData);
Log.debug("oid: " + workData.getOid() + " data: " + jsonData);
final Text key = new Text(workData.getOid());
final Text value = new Text(jsonData);
try {
writer.append(key, value);
} catch (IOException e) {
Log.debug("Writing to sequence file: " + e.getMessage());
Log.debug(e);
throw new RuntimeException(e);
}
doiFound += 1;
}
} else {
Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
xmlParserErrorFound += 1;
}
}
} catch (Exception e) {
Log
.warn(
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
Log.warn(e);
}
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
Log.info("Current xml works parsed: " + counter);
}
if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
break;
}
}
}
} catch (IOException e) {
Log.warn("Parsing work from gzip archive: " + e.getMessage());
Log.warn(e);
throw new RuntimeException(e);
}
Log.info("Activities parse completed");
Log.info("Total XML works parsed: " + counter);
Log.info("Total doi found: " + doiFound);
Log.info("Error from Orcid found: " + errorFromOrcidFound);
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
}
}

View File

@ -0,0 +1,81 @@
package eu.dnetlib.doiboost.orcid
import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
import org.apache.commons.lang.StringUtils
import org.codehaus.jackson.map.ObjectMapper
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,errorCode:String){}
case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
object ORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
val mapper = new ObjectMapper
def isJsonValid(inputStr: String): Boolean = {
import java.io.IOException
try {
mapper.readTree(inputStr)
true
} catch {
case e: IOException =>
false
}
}
def extractValueFromInputString(input: String): (String, String) = {
val i = input.indexOf('[')
if (i <5) {
return null
}
val orcidList = input.substring(i, input.length - 1)
val doi = input.substring(1, i - 1)
if (isJsonValid(orcidList)) {
(doi, orcidList)
} else null
}
def convertTOOAF(input:ORCIDElement) :Publication = {
val doi = input.doi
val pub:Publication = new Publication
pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
pub.setDataInfo(generateDataInfo())
pub.setId(generateIdentifier(pub, doi.toLowerCase))
try{
pub.setAuthor(input.authors.map(a=> {
generateAuthor(a.name, a.surname, a.creditName, a.oid)
}).asJava)
pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
pub
} catch {
case e: Throwable =>
logger.info(s"ERROR ON GENERATE Publication from $input")
null
}
}
def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
val a = new Author
a.setName(given)
a.setSurname(family)
if (fullName!= null && fullName.nonEmpty)
a.setFullname(fullName)
else
a.setFullname(s"$given $family")
if (StringUtils.isNotBlank(orcid))
a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
a
}
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.doiboost.orcid;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
private String activitiesFileNameTarGz;
private String outputAuthorsDOIsPath;
public static void main(String[] args) throws IOException, Exception {
OrcidAuthorsDOIsDataGen orcidAuthorsDOIsDataGen = new OrcidAuthorsDOIsDataGen();
orcidAuthorsDOIsDataGen.loadArgs(args);
orcidAuthorsDOIsDataGen.generateAuthorsDOIsData();
}
public void generateAuthorsDOIsData() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
}
private void loadArgs(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
OrcidAuthorsDOIsDataGen.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
Log.info("Default Path: " + hdfsOrcidDefaultPath);
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");
Log.info("Output Authors DOIs Data: " + outputAuthorsDOIsPath);
}
}

View File

@ -0,0 +1,81 @@
package eu.dnetlib.doiboost.orcid;
import java.io.IOException;
import java.net.URI;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class OrcidDSManager {
protected String hdfsServerUri;
protected String hdfsOrcidDefaultPath;
private String summariesFileNameTarGz;
private String outputAuthorsPath;
public static void main(String[] args) throws IOException, Exception {
OrcidDSManager orcidDSManager = new OrcidDSManager();
orcidDSManager.loadArgs(args);
orcidDSManager.generateAuthors();
}
public void generateAuthors() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
Path outputPath = new Path(
hdfsServerUri
.concat(hdfsOrcidDefaultPath)
.concat(outputAuthorsPath)
.concat("authors.seq"));
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
}
protected Configuration initConfigurationObject() {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
return conf;
}
protected FileSystem initFileSystemObject(Configuration conf) {
// Get the filesystem - HDFS
FileSystem fs = null;
try {
fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return fs;
}
private void loadArgs(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
OrcidDSManager.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
Log.info("Default Path: " + hdfsOrcidDefaultPath);
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
Log.info("Summaries File Name: " + summariesFileNameTarGz);
outputAuthorsPath = parser.get("outputAuthorsPath");
Log.info("Output Authors Data: " + outputAuthorsPath);
}
}

View File

@ -0,0 +1,203 @@
package eu.dnetlib.doiboost.orcid;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class OrcidDownloader extends OrcidDSManager {
static final int REQ_LIMIT = 24;
// static final int REQ_MAX_TEST = 100;
static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 10000;
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
static final String lastUpdate = "2019-09-30 00:00:00";
private String lambdaFileName;
private String outputPath;
private String token;
public static void main(String[] args) throws IOException, Exception {
OrcidDownloader orcidDownloader = new OrcidDownloader();
orcidDownloader.loadArgs(args);
orcidDownloader.parseLambdaFile();
}
private String downloadRecord(String orcidId) {
try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
CloseableHttpResponse response = client.execute(httpGet);
if (response.getStatusLine().getStatusCode() != 200) {
Log
.warn(
"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
return new String("");
}
return IOUtils.toString(response.getEntity().getContent());
} catch (Throwable e) {
Log.warn("Downloading " + orcidId, e.getMessage());
}
return new String("");
}
public void parseLambdaFile() throws Exception {
int parsedRecordsCounter = 0;
int downloadedRecordsCounter = 0;
int savedRecordsCounter = 0;
long startDownload = 0;
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
Path hdfsreadpath = new Path(lambdaFileUri);
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
Path hdfsoutputPath = new Path(
hdfsServerUri
.concat(hdfsOrcidDefaultPath)
.concat(outputPath)
.concat("orcid_records.seq"));
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(hdfsoutputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class))) {
try (BufferedReader br = new BufferedReader(new InputStreamReader(lambdaFileStream))) {
String line;
int nReqTmp = 0;
startDownload = System.currentTimeMillis();
long startReqTmp = System.currentTimeMillis();
while ((line = br.readLine()) != null) {
parsedRecordsCounter++;
// skip headers line
if (parsedRecordsCounter == 1) {
continue;
}
String[] values = line.split(",");
List<String> recordInfo = Arrays.asList(values);
String orcidId = recordInfo.get(0);
if (isModified(orcidId, recordInfo.get(3))) {
String record = downloadRecord(orcidId);
downloadedRecordsCounter++;
if (!record.isEmpty()) {
String compressRecord = ArgumentApplicationParser.compressArgument(record);
final Text key = new Text(recordInfo.get(0));
final Text value = new Text(compressRecord);
try {
writer.append(key, value);
savedRecordsCounter++;
} catch (IOException e) {
Log.warn("Writing to sequence file: " + e.getMessage());
Log.warn(e);
throw new RuntimeException(e);
}
}
}
long endReq = System.currentTimeMillis();
nReqTmp++;
if (nReqTmp == REQ_LIMIT) {
long reqSessionDuration = endReq - startReqTmp;
if (reqSessionDuration <= 1000) {
Log
.warn(
"\nreqSessionDuration: "
+ reqSessionDuration
+ " nReqTmp: "
+ nReqTmp
+ " wait ....");
Thread.sleep(1000 - reqSessionDuration);
} else {
nReqTmp = 0;
startReqTmp = System.currentTimeMillis();
}
}
// if (parsedRecordsCounter > REQ_MAX_TEST) {
// break;
// }
if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
Log
.info(
"Current parsed: "
+ parsedRecordsCounter
+ " downloaded: "
+ downloadedRecordsCounter
+ " saved: "
+ savedRecordsCounter);
// if (parsedRecordsCounter > REQ_MAX_TEST) {
// break;
// }
}
}
long endDownload = System.currentTimeMillis();
long downloadTime = endDownload - startDownload;
Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
}
}
lambdaFileStream.close();
Log.info("Download started at: " + new Date(startDownload).toString());
Log.info("Parsed Records Counter: " + parsedRecordsCounter);
Log.info("Downloaded Records Counter: " + downloadedRecordsCounter);
Log.info("Saved Records Counter: " + savedRecordsCounter);
}
private void loadArgs(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
OrcidDownloader.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
Log.info("Default Path: " + hdfsOrcidDefaultPath);
lambdaFileName = parser.get("lambdaFileName");
Log.info("Lambda File Name: " + lambdaFileName);
outputPath = parser.get("outputPath");
Log.info("Output Data: " + outputPath);
token = parser.get("token");
}
private boolean isModified(String orcidId, String modifiedDate) {
Date modifiedDateDt = null;
Date lastUpdateDt = null;
try {
if (modifiedDate.length() != 19) {
modifiedDate = modifiedDate.substring(0, 19);
}
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
} catch (Exception e) {
Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
return true;
}
return modifiedDateDt.after(lastUpdateDt);
}
}

View File

@ -0,0 +1,44 @@
package eu.dnetlib.doiboost.orcid
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkConvertORCIDToOAF {
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
import spark.implicits._
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
logger.info("Converting ORCID to OAF")
val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
.map(_._2)
spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -0,0 +1,180 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.esotericsoftware.minlog.Log;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData;
import scala.Tuple2;
public class SparkGenerateDoiAuthorList {
public static void main(String[] args) throws IOException, Exception {
Logger logger = LoggerFactory.getLogger(SparkGenerateDoiAuthorList.class);
logger.info("[ SparkGenerateDoiAuthorList STARTED]");
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkGenerateDoiAuthorList.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_doi_author_list_orcid_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
logger.info("workingPath: ", workingPath);
final String outputDoiAuthorListPath = parser.get("outputDoiAuthorListPath");
logger.info("outputDoiAuthorListPath: ", outputDoiAuthorListPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<Text, Text> summariesRDD = sc
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
Dataset<AuthorData> summariesDataset = spark
.createDataset(
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
Encoders.bean(AuthorData.class));
JavaPairRDD<Text, Text> activitiesRDD = sc
.sequenceFile(workingPath + "/output/*.seq", Text.class, Text.class);
Dataset<WorkData> activitiesDataset = spark
.createDataset(
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
Encoders.bean(WorkData.class));
Function<Tuple2<String, AuthorData>, Tuple2<String, List<AuthorData>>> toAuthorListFunction = data -> {
try {
String doi = data._1();
if (doi == null) {
return null;
}
AuthorData author = data._2();
if (author == null) {
return null;
}
List<AuthorData> toAuthorList = Arrays.asList(author);
return new Tuple2<>(doi, toAuthorList);
} catch (Exception e) {
Log.error("toAuthorListFunction ERROR", e);
return null;
}
};
JavaRDD<Tuple2<String, List<AuthorData>>> doisRDD = activitiesDataset
.joinWith(
summariesDataset,
activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
.map(
(MapFunction<Tuple2<WorkData, AuthorData>, Tuple2<String, AuthorData>>) value -> {
WorkData w = value._1;
AuthorData a = value._2;
return new Tuple2<>(w.getDoi(), a);
},
Encoders.tuple(Encoders.STRING(), Encoders.bean(AuthorData.class)))
.filter(Objects::nonNull)
.toJavaRDD()
.map(toAuthorListFunction);
JavaPairRDD
.fromJavaRDD(doisRDD)
.reduceByKey((d1, d2) -> {
try {
if (d1 != null && d2 != null) {
Stream<AuthorData> mergedStream = Stream
.concat(
d1.stream(),
d2.stream());
List<AuthorData> mergedAuthors = mergedStream.collect(Collectors.toList());
return mergedAuthors;
}
if (d1 != null) {
return d1;
}
if (d2 != null) {
return d2;
}
} catch (Exception e) {
Log.error("mergeAuthorsFunction ERROR", e);
return null;
}
return null;
})
.mapToPair(
s -> {
ObjectMapper mapper = new ObjectMapper();
return new Tuple2<>(s._1(), mapper.writeValueAsString(s._2()));
})
.repartition(10)
.saveAsTextFile(workingPath + outputDoiAuthorListPath);
});
}
private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
AuthorData authorData = new AuthorData();
authorData.setOid(orcidId.toString());
JsonElement jElement = new JsonParser().parse(json.toString());
authorData.setName(getJsonValue(jElement, "name"));
authorData.setSurname(getJsonValue(jElement, "surname"));
authorData.setCreditName(getJsonValue(jElement, "creditname"));
return authorData;
}
private static WorkData loadWorkFromJson(Text orcidId, Text json) {
WorkData workData = new WorkData();
workData.setOid(orcidId.toString());
JsonElement jElement = new JsonParser().parse(json.toString());
workData.setDoi(getJsonValue(jElement, "doi"));
return workData;
}
private static String getJsonValue(JsonElement jElement, String property) {
if (jElement.getAsJsonObject().has(property)) {
JsonElement name = null;
name = jElement.getAsJsonObject().get(property);
if (name != null && !name.isJsonNull()) {
return name.getAsString();
}
}
return null;
}
}

View File

@ -0,0 +1,165 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.util.LongAccumulator;
import org.mortbay.log.Log;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
import scala.Tuple2;
public class SparkOrcidGenerateAuthors {
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
static final String lastUpdate = "2019-09-30 00:00:00";
public static void main(String[] args) throws IOException, Exception {
Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);
logger.info("[ SparkOrcidGenerateAuthors STARTED]");
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkOrcidGenerateAuthors.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
logger.info("workingPath: ", workingPath);
final String outputAuthorsPath = parser.get("outputAuthorsPath");
logger.info("outputAuthorsPath: ", outputAuthorsPath);
final String token = parser.get("token");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
LongAccumulator parsedRecordsAcc = sc.sc().longAccumulator("parsedRecords");
LongAccumulator modifiedRecordsAcc = sc.sc().longAccumulator("modifiedRecords");
LongAccumulator downloadedRecordsAcc = sc.sc().longAccumulator("downloadedRecords");
LongAccumulator alreadyDownloadedRecords = sc.sc().longAccumulator("alreadyDownloadedRecords");
JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "lamdafiles");
JavaRDD<String> downloadedRDD = sc.textFile(workingPath + "downloaded");
Function<String, String> getOrcidIdFunction = line -> {
try {
String[] values = line.split(",");
return values[0].substring(1);
} catch (Exception e) {
return new String("");
}
};
List<String> downloadedRecords = downloadedRDD.map(getOrcidIdFunction).collect();
Function<String, Boolean> isModifiedAfterFilter = line -> {
String[] values = line.split(",");
String orcidId = values[0];
parsedRecordsAcc.add(1);
if (isModified(orcidId, values[3])) {
modifiedRecordsAcc.add(1);
return true;
}
return false;
};
Function<String, Boolean> isNotDownloadedFilter = line -> {
String[] values = line.split(",");
String orcidId = values[0];
if (downloadedRecords.contains(orcidId)) {
alreadyDownloadedRecords.add(1);
return false;
}
return true;
};
Function<String, Tuple2<String, String>> downloadRecordFunction = line -> {
String[] values = line.split(",");
String orcidId = values[0];
String modifiedDate = values[3];
return downloadRecord(orcidId, modifiedDate, token, downloadedRecordsAcc);
};
lamdaFileRDD
.filter(isModifiedAfterFilter)
.filter(isNotDownloadedFilter)
.map(downloadRecordFunction)
.rdd()
.saveAsTextFile(workingPath.concat(outputAuthorsPath));
});
}
private static boolean isModified(String orcidId, String modifiedDate) {
Date modifiedDateDt = null;
Date lastUpdateDt = null;
try {
if (modifiedDate.length() != 19) {
modifiedDate = modifiedDate.substring(0, 19);
}
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
} catch (Exception e) {
Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
return true;
}
return modifiedDateDt.after(lastUpdateDt);
}
private static Tuple2<String, String> downloadRecord(String orcidId, String modifiedDate, String token,
LongAccumulator downloadedRecordsAcc) {
final DownloadedRecordData data = new DownloadedRecordData();
data.setOrcidId(orcidId);
data.setModifiedDate(modifiedDate);
try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
CloseableHttpResponse response = client.execute(httpGet);
int statusCode = response.getStatusLine().getStatusCode();
data.setStatusCode(statusCode);
if (statusCode != 200) {
Log
.warn(
"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
return data.toTuple2();
}
downloadedRecordsAcc.add(1);
data
.setCompressedData(
ArgumentApplicationParser.compressArgument(IOUtils.toString(response.getEntity().getContent())));
} catch (Throwable e) {
Log.warn("Downloading " + orcidId, e.getMessage());
data.setErrorMessage(e.getMessage());
return data.toTuple2();
}
return data.toTuple2();
}
}

View File

@ -0,0 +1,50 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class SparkPartitionLambdaFile {
public static void main(String[] args) throws IOException, Exception {
Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkOrcidGenerateAuthors.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String workingPath = parser.get("workingPath");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "last_modified.csv");
lamdaFileRDD
.repartition(20)
.saveAsTextFile(workingPath.concat("lamdafiles"));
});
}
}

View File

@ -0,0 +1,158 @@
package eu.dnetlib.doiboost.orcid;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log;
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
public class SummariesDecompressor {
private static final int MAX_XML_RECORDS_PARSED = -1;
public static void parseGzSummaries(Configuration conf, String inputUri, Path outputPath)
throws Exception {
String uri = inputUri;
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path inputPath = new Path(uri);
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(inputPath);
if (codec == null) {
System.err.println("No codec found for " + uri);
System.exit(1);
}
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
InputStream gzipInputStream = null;
try {
gzipInputStream = codec.createInputStream(fs.open(inputPath));
parseTarSummaries(fs, conf, gzipInputStream, outputPath);
} finally {
Log.debug("Closing gzip stream");
IOUtils.closeStream(gzipInputStream);
}
}
private static void parseTarSummaries(
FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
int counter = 0;
int nameFound = 0;
int surnameFound = 0;
int creditNameFound = 0;
int errorFromOrcidFound = 0;
int xmlParserErrorFound = 0;
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
TarArchiveEntry entry = null;
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(outputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class))) {
while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName();
try {
if (entry.isDirectory()) {
Log.debug("Directory entry name: " + entry.getName());
} else {
Log.debug("XML record entry name: " + entry.getName());
counter++;
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
// tarInput
String line;
StringBuffer buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line);
}
AuthorData authorData = XMLRecordParser.VTDParseAuthorData(buffer.toString().getBytes());
if (authorData != null) {
if (authorData.getErrorCode() != null) {
errorFromOrcidFound += 1;
Log
.debug(
"error from Orcid with code "
+ authorData.getErrorCode()
+ " for oid "
+ entry.getName());
continue;
}
String jsonData = JsonWriter.create(authorData);
Log.debug("oid: " + authorData.getOid() + " data: " + jsonData);
final Text key = new Text(authorData.getOid());
final Text value = new Text(jsonData);
try {
writer.append(key, value);
} catch (IOException e) {
Log.debug("Writing to sequence file: " + e.getMessage());
Log.debug(e);
throw new RuntimeException(e);
}
if (authorData.getName() != null) {
nameFound += 1;
}
if (authorData.getSurname() != null) {
surnameFound += 1;
}
if (authorData.getCreditName() != null) {
creditNameFound += 1;
}
} else {
Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
xmlParserErrorFound += 1;
}
}
} catch (Exception e) {
Log
.warn(
"Parsing record from tar archive and xml record: "
+ filename
+ " "
+ e.getMessage());
Log.warn(e);
}
if ((counter % 100000) == 0) {
Log.info("Current xml records parsed: " + counter);
}
if ((MAX_XML_RECORDS_PARSED > -1) && (counter > MAX_XML_RECORDS_PARSED)) {
break;
}
}
}
} catch (IOException e) {
Log.warn("Parsing record from gzip archive: " + e.getMessage());
Log.warn(e);
throw new RuntimeException(e);
}
Log.info("Summaries parse completed");
Log.info("Total XML records parsed: " + counter);
Log.info("Name found: " + nameFound);
Log.info("Surname found: " + surnameFound);
Log.info("Credit name found: " + creditNameFound);
Log.info("Error from Orcid found: " + errorFromOrcidFound);
Log.info("Error parsing xml record found: " + xmlParserErrorFound);
}
}

View File

@ -0,0 +1,28 @@
package eu.dnetlib.doiboost.orcid.json;
import com.google.gson.JsonObject;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData;
public class JsonWriter {
public static String create(AuthorData authorData) {
JsonObject author = new JsonObject();
author.addProperty("oid", authorData.getOid());
author.addProperty("name", authorData.getName());
author.addProperty("surname", authorData.getSurname());
if (authorData.getCreditName() != null) {
author.addProperty("creditname", authorData.getCreditName());
}
return author.toString();
}
public static String create(WorkData workData) {
JsonObject work = new JsonObject();
work.addProperty("oid", workData.getOid());
work.addProperty("doi", workData.getDoi());
return work.toString();
}
}

View File

@ -0,0 +1,53 @@
package eu.dnetlib.doiboost.orcid.model;
import java.io.Serializable;
public class AuthorData implements Serializable {
private String oid;
private String name;
private String surname;
private String creditName;
private String errorCode;
public String getErrorCode() {
return errorCode;
}
public void setErrorCode(String errorCode) {
this.errorCode = errorCode;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getSurname() {
return surname;
}
public void setSurname(String surname) {
this.surname = surname;
}
public String getCreditName() {
return creditName;
}
public void setCreditName(String creditName) {
this.creditName = creditName;
}
public String getOid() {
return oid;
}
public void setOid(String oid) {
this.oid = oid;
}
}

View File

@ -0,0 +1,76 @@
package eu.dnetlib.doiboost.orcid.model;
import java.io.Serializable;
import org.apache.hadoop.io.Text;
import com.google.gson.JsonObject;
import scala.Tuple2;
public class DownloadedRecordData implements Serializable {
private String orcidId;
private String modifiedDate;
private String statusCode;
private String compressedData;
private String errorMessage;
public Tuple2<String, String> toTuple2() {
JsonObject data = new JsonObject();
data.addProperty("statusCode", getStatusCode());
data.addProperty("modifiedDate", getModifiedDate());
if (getCompressedData() != null) {
data.addProperty("compressedData", getCompressedData());
}
if (getErrorMessage() != null) {
data.addProperty("errorMessage", getErrorMessage());
}
return new Tuple2<>(orcidId, data.toString());
}
public String getErrorMessage() {
return errorMessage;
}
public void setErrorMessage(String errorMessage) {
this.errorMessage = errorMessage;
}
public String getOrcidId() {
return orcidId;
}
public void setOrcidId(String orcidId) {
this.orcidId = orcidId;
}
public int getStatusCode() {
try {
return Integer.parseInt(statusCode);
} catch (Exception e) {
return -2;
}
}
public void setStatusCode(int statusCode) {
this.statusCode = Integer.toString(statusCode);
}
public String getCompressedData() {
return compressedData;
}
public void setCompressedData(String compressedData) {
this.compressedData = compressedData;
}
public String getModifiedDate() {
return modifiedDate;
}
public void setModifiedDate(String modifiedDate) {
this.modifiedDate = modifiedDate;
}
}

View File

@ -0,0 +1,45 @@
package eu.dnetlib.doiboost.orcid.model;
import java.io.Serializable;
public class WorkData implements Serializable {
private String oid;
private String doi;
private boolean doiFound = false;
public boolean isDoiFound() {
return doiFound;
}
public void setDoiFound(boolean doiFound) {
this.doiFound = doiFound;
}
public String getOid() {
return oid;
}
public void setOid(String oid) {
this.oid = oid;
}
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getErrorCode() {
return errorCode;
}
public void setErrorCode(String errorCode) {
this.errorCode = errorCode;
}
private String errorCode;
}

View File

@ -0,0 +1,123 @@
package eu.dnetlib.doiboost.orcid.xml;
import java.util.Arrays;
import java.util.List;
import com.ximpleware.AutoPilot;
import com.ximpleware.EOFException;
import com.ximpleware.EncodingException;
import com.ximpleware.EntityException;
import com.ximpleware.ParseException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData;
public class XMLRecordParser {
private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
private static final String NS_COMMON = "common";
private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person";
private static final String NS_PERSON = "person";
private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details";
private static final String NS_DETAILS = "personal-details";
private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name";
private static final String NS_OTHER = "other-name";
private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
private static final String NS_RECORD = "record";
private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
private static final String NS_WORK = "work";
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
private static final String NS_ERROR = "error";
public static AuthorData VTDParseAuthorData(byte[] bytes)
throws VtdException, EncodingException, EOFException, EntityException, ParseException {
final VTDGen vg = new VTDGen();
vg.setDoc(bytes);
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
ap.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL);
ap.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL);
ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL);
ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL);
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
AuthorData authorData = new AuthorData();
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
if (!errors.isEmpty()) {
authorData.setErrorCode(errors.get(0));
return authorData;
}
List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, "//record:record", Arrays.asList("path"));
if (!recordNodes.isEmpty()) {
final String oid = (recordNodes.get(0).getAttributes().get("path")).substring(1);
authorData.setOid(oid);
} else {
return null;
}
final List<String> names = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:given-names");
if (!names.isEmpty()) {
authorData.setName(names.get(0));
}
final List<String> surnames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:family-name");
if (!surnames.isEmpty()) {
authorData.setSurname(surnames.get(0));
}
final List<String> creditNames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:credit-name");
if (!creditNames.isEmpty()) {
authorData.setCreditName(creditNames.get(0));
}
return authorData;
}
public static WorkData VTDParseWorkData(byte[] bytes)
throws VtdException, EncodingException, EOFException, EntityException, ParseException {
final VTDGen vg = new VTDGen();
vg.setDoc(bytes);
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
WorkData workData = new WorkData();
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
if (!errors.isEmpty()) {
workData.setErrorCode(errors.get(0));
return workData;
}
List<VtdUtilityParser.Node> workNodes = VtdUtilityParser
.getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path"));
if (!workNodes.isEmpty()) {
final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1];
workData.setOid(oid);
} else {
return null;
}
final List<String> dois = VtdUtilityParser
.getTextValue(
ap, vn, "//common:external-id-type[text()=\"doi\"]/../common:external-id-value");
if (!dois.isEmpty()) {
workData.setDoi(dois.get(0));
workData.setDoiFound(true);
}
return workData;
}
}

Some files were not shown because too many files have changed in this diff Show More