forked from D-Net/dnet-hadoop
Merge branch 'beta' into crossref_mapping_improvement
This commit is contained in:
commit
302c4d044e
|
@ -10,6 +10,11 @@ public class Constants {
|
|||
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
|
||||
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
|
||||
|
||||
public static final String RAID_NS_PREFIX = "raid________";
|
||||
|
||||
public static final String END_DATE = "endDate";
|
||||
public static final String START_DATE = "startDate";
|
||||
|
||||
public static final String ROR_NS_PREFIX = "ror_________";
|
||||
|
||||
public static final String ROR_OPENAIRE_ID = "10|openaire____::993a7ae7a863813cf95028b50708e222";
|
||||
|
|
|
@ -21,7 +21,7 @@ public class CodeMatch extends AbstractStringComparator {
|
|||
/**
 * Builds the comparator from its configuration map.
 *
 * @param params comparator parameters; the optional {@code codeRegex} entry overrides the default pattern
 *               {@code [a-zA-Z]+::\d+} used to extract codes
 */
public CodeMatch(Map<String, String> params) {
	super(params);
	this.params = params;
	// Single assignment: the former extra compile of "[a-zA-Z]::\\d+" was overwritten right away and is dropped.
	this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]+::\\d+"));
}
|
||||
|
||||
public Set<String> getRegexList(String input) {
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.time.DateTimeException;
|
||||
import java.time.LocalDate;
|
||||
import java.time.Period;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("dateRange")
|
||||
public class DateRange extends AbstractStringComparator {
|
||||
|
||||
int YEAR_RANGE;
|
||||
|
||||
public DateRange(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3"));
|
||||
}
|
||||
|
||||
public DateRange(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected DateRange(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
public static boolean isNumeric(String str) {
|
||||
return str.matches("\\d+"); // match a number with optional '-' and decimal.
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1.0; // return -1 if a field is missing
|
||||
}
|
||||
|
||||
try {
|
||||
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH);
|
||||
LocalDate d1 = LocalDate.parse(a, formatter);
|
||||
LocalDate d2 = LocalDate.parse(b, formatter);
|
||||
Period period = Period.between(d1, d2);
|
||||
|
||||
return period.getYears() <= YEAR_RANGE ? 1.0 : 0.0;
|
||||
} catch (DateTimeException e) {
|
||||
return -1.0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
}
|
|
@ -41,21 +41,38 @@ public class JsonListMatch extends AbstractListComparator {
|
|||
return -1;
|
||||
}
|
||||
|
||||
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||
switch (MODE) {
|
||||
case "count":
|
||||
return Sets.intersection(ca, cb).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
case "percentage":
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
return (double) incommon / (incommon + simDiff);
|
||||
|
||||
case "type":
|
||||
Set<String> typesA = ca.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
|
||||
Set<String> typesB = cb.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
|
||||
|
||||
Set<String> types = Sets.intersection(typesA, typesB);
|
||||
|
||||
if (types.isEmpty()) // if no common type, it is impossible to compare
|
||||
return -1;
|
||||
|
||||
ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
|
||||
cb = cb.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
|
||||
|
||||
return (double) Sets.intersection(ca, cb).size() / types.size();
|
||||
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (MODE.equals("percentage"))
|
||||
return (double) incommon / (incommon + simDiff);
|
||||
else
|
||||
return incommon;
|
||||
|
||||
}
|
||||
|
||||
// converts every json into a comparable string basing on parameters
|
||||
|
@ -69,7 +86,7 @@ public class JsonListMatch extends AbstractListComparator {
|
|||
// for each path in the param list
|
||||
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
||||
String path = params.get(key);
|
||||
String value = MapDocumentUtil.getJPathString(path, documentContext);
|
||||
String value = MapDocumentUtil.getJPathString(path, documentContext).toLowerCase();
|
||||
if (value == null || value.isEmpty())
|
||||
value = "";
|
||||
st.append(value);
|
||||
|
|
|
@ -65,6 +65,43 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void datasetVersionCodeMatchTest() {
|
||||
|
||||
params.put("codeRegex", "(?=[\\w-]*[a-zA-Z])(?=[\\w-]*\\d)[\\w-]+");
|
||||
CodeMatch codeMatch = new CodeMatch(params);
|
||||
|
||||
// names have different codes
|
||||
assertEquals(
|
||||
0.0,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998 ev02a",
|
||||
"physical oceanography at ctd station june 1998 ir02", conf));
|
||||
|
||||
// names have same code
|
||||
assertEquals(
|
||||
1.0,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998 ev02a",
|
||||
"physical oceanography at ctd station june 1998 ev02a", conf));
|
||||
|
||||
// code is not in both names
|
||||
assertEquals(
|
||||
-1,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998",
|
||||
"physical oceanography at ctd station june 1998 ev02a", conf));
|
||||
assertEquals(
|
||||
1.0,
|
||||
codeMatch
|
||||
.distance(
|
||||
"physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998",
|
||||
conf));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void listContainsMatchTest() {
|
||||
|
||||
|
@ -257,15 +294,15 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
List<String> a = createFieldList(
|
||||
Arrays
|
||||
.asList(
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"),
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"),
|
||||
"authors");
|
||||
List<String> b = createFieldList(
|
||||
Arrays
|
||||
.asList(
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"),
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_2\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"isni\",\"classname\":\"ISNI Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"isni_1\"}"),
|
||||
"authors");
|
||||
|
||||
double result = jsonListMatch.compare(a, b, conf);
|
||||
|
@ -277,6 +314,13 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
result = jsonListMatch.compare(a, b, conf);
|
||||
|
||||
assertEquals(1.0, result);
|
||||
|
||||
params.put("mode", "type");
|
||||
jsonListMatch = new JsonListMatch(params);
|
||||
result = jsonListMatch.compare(a, b, conf);
|
||||
|
||||
assertEquals(0.5, result);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -327,6 +371,24 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void dateMatch() {
|
||||
|
||||
DateRange dateRange = new DateRange(params);
|
||||
|
||||
double result = dateRange.distance("2021-05-13", "2023-05-13", conf);
|
||||
assertEquals(1.0, result);
|
||||
|
||||
result = dateRange.distance("2021-05-13", "2025-05-13", conf);
|
||||
assertEquals(0.0, result);
|
||||
|
||||
result = dateRange.distance("", "2020-05-05", conf);
|
||||
assertEquals(-1.0, result);
|
||||
|
||||
result = dateRange.distance("invalid date", "2021-05-02", conf);
|
||||
assertEquals(-1.0, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void titleVersionMatchTest() {
|
||||
|
||||
|
|
|
@ -26,16 +26,16 @@
|
|||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-actionmanager</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-aggregation</artifactId>-->
|
||||
<!-- <artifactId>dhp-actionmanager</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-blacklist</artifactId>-->
|
||||
|
@ -56,61 +56,61 @@
|
|||
<!-- <artifactId>dhp-enrichment</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-graph-mapper</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-graph-provision</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-impact-indicators</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-actionsets</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-hist-snaps</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-monitor-irish</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-promote</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-update</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-swh</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-usage-raw-data-update</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-usage-stats-build</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-graph-mapper</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-graph-provision</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-impact-indicators</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-actionsets</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-hist-snaps</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-monitor-irish</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-promote</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-update</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-swh</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-usage-raw-data-update</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-usage-stats-build</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
</dependencies>
|
||||
|
||||
|
||||
|
|
|
@ -135,22 +135,10 @@
|
|||
<arg>--outputPath</arg><arg>${workingDir}/action_payload_by_type</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="ForkPromote"/>
|
||||
<ok to="PromoteActionPayloadForDatasetTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="ForkPromote">
|
||||
<path start="PromoteActionPayloadForDatasetTable"/>
|
||||
<path start="PromoteActionPayloadForDatasourceTable"/>
|
||||
<path start="PromoteActionPayloadForOrganizationTable"/>
|
||||
<path start="PromoteActionPayloadForOtherResearchProductTable"/>
|
||||
<path start="PromoteActionPayloadForProjectTable"/>
|
||||
<path start="PromoteActionPayloadForPublicationTable"/>
|
||||
<path start="PromoteActionPayloadForRelationTable"/>
|
||||
<path start="PromoteActionPayloadForSoftwareTable"/>
|
||||
<path start="PromoteActionPayloadForPersonTable"/>
|
||||
</fork>
|
||||
|
||||
<action name="PromoteActionPayloadForDatasetTable">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/promote_action_payload_for_dataset_table</app-path>
|
||||
|
@ -162,7 +150,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForDatasourceTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -177,7 +165,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForOrganizationTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -192,7 +180,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForOtherResearchProductTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -207,7 +195,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForProjectTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -222,7 +210,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForPublicationTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -237,7 +225,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForRelationTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -252,7 +240,7 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="PromoteActionPayloadForSoftwareTable"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -267,26 +255,9 @@
|
|||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="PromoteActionPayloadForPersonTable">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/promote_action_payload_for_person_table</app-path>
|
||||
<propagate-configuration/>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>inputActionPayloadRootPath</name>
|
||||
<value>${workingDir}/action_payload_by_type</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="JoinPromote"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="JoinPromote" to="End"/>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -13,6 +13,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
|
|
@ -15,7 +15,7 @@ import java.util.stream.Collectors;
|
|||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
|
@ -30,7 +30,6 @@ import org.apache.spark.sql.Dataset;
|
|||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.spark_project.jetty.util.StringUtil;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
|
@ -207,7 +206,7 @@ public class ExtractPerson implements Serializable {
|
|||
null);
|
||||
relation.setValidated(true);
|
||||
|
||||
if (StringUtil.isNotBlank(role)) {
|
||||
if (StringUtils.isNotBlank(role)) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey("role");
|
||||
kv.setValue(role);
|
||||
|
@ -347,15 +346,19 @@ public class ExtractPerson implements Serializable {
|
|||
.structuredProperty(
|
||||
op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
|
||||
ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES,
|
||||
OafMapperUtils.dataInfo(false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
|
||||
ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES),
|
||||
"0.91")));
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
|
||||
ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES),
|
||||
"0.91")));
|
||||
person.setDateofcollection(op.getLastModifiedDate());
|
||||
person.setOriginalId(Arrays.asList(op.getOrcid()));
|
||||
person.setDataInfo(ORCIDDATAINFO);
|
||||
|
@ -449,13 +452,13 @@ public class ExtractPerson implements Serializable {
|
|||
null);
|
||||
relation.setValidated(true);
|
||||
|
||||
if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) {
|
||||
if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtils.isNotBlank(row.getStartDate())) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey("startDate");
|
||||
kv.setValue(row.getStartDate());
|
||||
properties.add(kv);
|
||||
}
|
||||
if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) {
|
||||
if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtils.isNotBlank(row.getEndDate())) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey("endDate");
|
||||
kv.setValue(row.getEndDate());
|
||||
|
|
|
@ -0,0 +1,203 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.raid;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID;
|
||||
import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME;
|
||||
import static eu.dnetlib.dhp.common.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateRAiDActionSetJob {
|
||||
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final List<KeyValue> RAID_COLLECTED_FROM = listKeyValues(
|
||||
OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||
|
||||
private static final Qualifier RAID_QUALIFIER = qualifier(
|
||||
"0049", "Research Activity Identifier", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE);
|
||||
|
||||
private static final Qualifier RAID_INFERENCE_QUALIFIER = qualifier(
|
||||
"raid:openaireinference", "Inferred by OpenAIRE", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS);
|
||||
|
||||
private static final DataInfo RAID_DATA_INFO = dataInfo(
|
||||
false, OPENAIRE_DATASOURCE_NAME, true, false, RAID_INFERENCE_QUALIFIER, "0.92");
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
final String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
processRAiDEntities(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void removeOutputDir(final SparkSession spark, final String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
static void processRAiDEntities(final SparkSession spark,
|
||||
final String inputPath,
|
||||
final String outputPath) {
|
||||
readInputPath(spark, inputPath)
|
||||
.map(GenerateRAiDActionSetJob::prepareRAiD)
|
||||
.flatMap(List::iterator)
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
|
||||
}
|
||||
|
||||
protected static List<AtomicAction<? extends Oaf>> prepareRAiD(final RAiDEntity r) {
|
||||
|
||||
final Date now = new Date();
|
||||
final OtherResearchProduct orp = new OtherResearchProduct();
|
||||
final List<AtomicAction<? extends Oaf>> res = new ArrayList<>();
|
||||
String raidId = calculateOpenaireId(r.getRaid());
|
||||
|
||||
orp.setId(raidId);
|
||||
orp.setCollectedfrom(RAID_COLLECTED_FROM);
|
||||
orp.setDataInfo(RAID_DATA_INFO);
|
||||
orp
|
||||
.setTitle(
|
||||
Collections
|
||||
.singletonList(
|
||||
structuredProperty(
|
||||
r.getTitle(),
|
||||
qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE),
|
||||
RAID_DATA_INFO)));
|
||||
orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary()));
|
||||
|
||||
Instance instance = new Instance();
|
||||
instance.setInstancetype(RAID_QUALIFIER);
|
||||
orp.setInstance(Collections.singletonList(instance));
|
||||
orp
|
||||
.setSubject(
|
||||
r
|
||||
.getSubjects()
|
||||
.stream()
|
||||
.map(
|
||||
s -> subject(
|
||||
s,
|
||||
qualifier(
|
||||
DNET_SUBJECT_KEYWORD, DNET_SUBJECT_KEYWORD, DNET_SUBJECT_TYPOLOGIES,
|
||||
DNET_SUBJECT_TYPOLOGIES),
|
||||
RAID_DATA_INFO))
|
||||
.collect(Collectors.toList()));
|
||||
orp
|
||||
.setRelevantdate(
|
||||
Arrays
|
||||
.asList(
|
||||
structuredProperty(
|
||||
r.getEndDate(), qualifier(END_DATE, END_DATE, DNET_DATACITE_DATE, DNET_DATACITE_DATE),
|
||||
RAID_DATA_INFO),
|
||||
structuredProperty(
|
||||
r.getStartDate(),
|
||||
qualifier(START_DATE, START_DATE, DNET_DATACITE_DATE, DNET_DATACITE_DATE),
|
||||
RAID_DATA_INFO)));
|
||||
orp.setLastupdatetimestamp(now.getTime());
|
||||
orp.setDateofacceptance(field(r.getStartDate(), RAID_DATA_INFO));
|
||||
|
||||
res.add(new AtomicAction<>(OtherResearchProduct.class, orp));
|
||||
|
||||
for (String resultId : r.getIds()) {
|
||||
Relation rel1 = OafMapperUtils
|
||||
.getRelation(
|
||||
raidId,
|
||||
resultId,
|
||||
ModelConstants.RESULT_RESULT,
|
||||
PART,
|
||||
HAS_PART,
|
||||
orp);
|
||||
Relation rel2 = OafMapperUtils
|
||||
.getRelation(
|
||||
resultId,
|
||||
raidId,
|
||||
ModelConstants.RESULT_RESULT,
|
||||
PART,
|
||||
IS_PART_OF,
|
||||
orp);
|
||||
res.add(new AtomicAction<>(Relation.class, rel1));
|
||||
res.add(new AtomicAction<>(Relation.class, rel2));
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
public static String calculateOpenaireId(final String raid) {
|
||||
return String.format("50|%s::%s", RAID_NS_PREFIX, DHPUtils.md5(raid));
|
||||
}
|
||||
|
||||
public static List<Author> createAuthors(final List<String> author) {
|
||||
return author.stream().map(s -> {
|
||||
Author a = new Author();
|
||||
a.setFullname(s);
|
||||
return a;
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static JavaRDD<RAiDEntity> readInputPath(
|
||||
final SparkSession spark,
|
||||
final String path) {
|
||||
|
||||
return spark
|
||||
.read()
|
||||
.json(path)
|
||||
.as(Encoders.bean(RAiDEntity.class))
|
||||
.toJavaRDD();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.raid.model;
|
||||
|
||||
/**
 * Empty class with no fields and no methods.
 *
 * NOTE(review): the Oozie workflow added in the same change launches
 * {@code eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob}, i.e. a class with the same
 * simple name in the parent package. This one, declared under {@code raid.model}, looks like a
 * leftover; confirm whether it can be removed.
 */
public class GenerateRAiDActionSetJob {
}
|
|
@ -0,0 +1,106 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.raid.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Bean describing one RAiD record.
 *
 * It follows the JavaBean conventions (no-arg constructor plus a getter/setter pair per
 * property) and is {@link Serializable}. State is kept in private fields and is only
 * reachable through the accessors.
 */
public class RAiDEntity implements Serializable {

	/** The RAiD string of the record. */
	private String raid;

	/** Author names. */
	private List<String> authors;

	/** Start date, kept as the raw string found in the input. */
	private String startDate;

	/** End date, kept as the raw string found in the input. */
	private String endDate;

	/** Subject terms. */
	private List<String> subjects;

	/** Titles. */
	private List<String> titles;

	/** Identifiers carried by the record. */
	private List<String> ids;

	/** The single title of the record. */
	private String title;

	/** Free-text summary. */
	private String summary;

	/**
	 * Creates an empty entity; every property is {@code null} until set.
	 */
	public RAiDEntity() {
	}

	/**
	 * Creates a fully populated entity. The given lists are stored as they are, without copying.
	 *
	 * @param raid the RAiD string
	 * @param authors the author names
	 * @param startDate the start date
	 * @param endDate the end date
	 * @param subjects the subject terms
	 * @param titles the titles
	 * @param ids the identifiers carried by the record
	 * @param title the single title
	 * @param summary the summary
	 */
	public RAiDEntity(String raid, List<String> authors, String startDate, String endDate, List<String> subjects,
		List<String> titles, List<String> ids, String title, String summary) {
		this.raid = raid;
		this.authors = authors;
		this.startDate = startDate;
		this.endDate = endDate;
		this.subjects = subjects;
		this.titles = titles;
		this.ids = ids;
		this.title = title;
		this.summary = summary;
	}

	/** @return the RAiD string */
	public String getRaid() {
		return raid;
	}

	/** @param raid the RAiD string */
	public void setRaid(String raid) {
		this.raid = raid;
	}

	/** @return the author names */
	public List<String> getAuthors() {
		return authors;
	}

	/** @param authors the author names */
	public void setAuthors(List<String> authors) {
		this.authors = authors;
	}

	/** @return the start date */
	public String getStartDate() {
		return startDate;
	}

	/** @param startDate the start date */
	public void setStartDate(String startDate) {
		this.startDate = startDate;
	}

	/** @return the end date */
	public String getEndDate() {
		return endDate;
	}

	/** @param endDate the end date */
	public void setEndDate(String endDate) {
		this.endDate = endDate;
	}

	/** @return the subject terms */
	public List<String> getSubjects() {
		return subjects;
	}

	/** @param subjects the subject terms */
	public void setSubjects(List<String> subjects) {
		this.subjects = subjects;
	}

	/** @return the titles */
	public List<String> getTitles() {
		return titles;
	}

	/** @param titles the titles */
	public void setTitles(List<String> titles) {
		this.titles = titles;
	}

	/** @return the identifiers carried by the record */
	public List<String> getIds() {
		return ids;
	}

	/** @param ids the identifiers carried by the record */
	public void setIds(List<String> ids) {
		this.ids = ids;
	}

	/** @return the single title */
	public String getTitle() {
		return title;
	}

	/** @param title the single title */
	public void setTitle(String title) {
		this.title = title;
	}

	/** @return the summary */
	public String getSummary() {
		return summary;
	}

	/** @param summary the summary */
	public void setSummary(String summary) {
		this.summary = summary;
	}
}
|
|
@ -44,13 +44,7 @@ import eu.dnetlib.dhp.common.Constants;
|
|||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
|
|
|
@ -154,7 +154,6 @@ public class ORCIDExtractor extends Thread {
|
|||
extractedItem++;
|
||||
if (extractedItem % 100000 == 0) {
|
||||
log.info("Thread {}: Extracted {} items", id, extractedItem);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.gtr2;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.LocalDate;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
|
@ -16,9 +19,6 @@ import org.dom4j.Document;
|
|||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Element;
|
||||
import org.joda.time.DateTime;
|
||||
import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -33,7 +33,7 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
|
|||
private static final Logger log = LoggerFactory.getLogger(Gtr2PublicationsIterator.class);
|
||||
|
||||
private final HttpConnector2 connector;
|
||||
private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd");
|
||||
private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd");
|
||||
|
||||
private static final int MAX_ATTEMPTS = 10;
|
||||
|
||||
|
@ -41,7 +41,7 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
|
|||
private int currPage;
|
||||
private int endPage;
|
||||
private boolean incremental = false;
|
||||
private DateTime fromDate;
|
||||
private LocalDate fromDate;
|
||||
|
||||
private final Map<String, String> cache = new HashMap<>();
|
||||
|
||||
|
@ -188,28 +188,28 @@ public class Gtr2PublicationsIterator implements Iterator<String> {
|
|||
|
||||
private Document loadURL(final String cleanUrl, final int attempt) {
|
||||
try {
|
||||
log.debug(" * Downloading Url: " + cleanUrl);
|
||||
final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes("UTF-8");
|
||||
log.debug(" * Downloading Url: {}", cleanUrl);
|
||||
final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes(StandardCharsets.UTF_8);
|
||||
return DocumentHelper.parseText(new String(bytes));
|
||||
} catch (final Throwable e) {
|
||||
log.error("Error dowloading url: " + cleanUrl + ", attempt = " + attempt, e);
|
||||
log.error("Error dowloading url: {}, attempt = {}", cleanUrl, attempt, e);
|
||||
if (attempt >= MAX_ATTEMPTS) {
|
||||
throw new RuntimeException("Error dowloading url: " + cleanUrl, e);
|
||||
throw new RuntimeException("Error downloading url: " + cleanUrl, e);
|
||||
}
|
||||
try {
|
||||
Thread.sleep(60000); // I wait for a minute
|
||||
} catch (final InterruptedException e1) {
|
||||
throw new RuntimeException("Error dowloading url: " + cleanUrl, e);
|
||||
throw new RuntimeException("Error downloading url: " + cleanUrl, e);
|
||||
}
|
||||
return loadURL(cleanUrl, attempt + 1);
|
||||
}
|
||||
}
|
||||
|
||||
private DateTime parseDate(final String s) {
|
||||
return DateTime.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter);
|
||||
private LocalDate parseDate(final String s) {
|
||||
return LocalDate.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter);
|
||||
}
|
||||
|
||||
private boolean isAfter(final String d, final DateTime fromDate) {
|
||||
private boolean isAfter(final String d, final LocalDate fromDate) {
|
||||
return StringUtils.isNotBlank(d) && parseDate(d).isAfter(fromDate);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@ import java.util.Queue;
|
|||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.http.Header;
|
||||
|
@ -27,25 +27,25 @@ public class ResearchFiIterator implements Iterator<String> {
|
|||
|
||||
private final String baseUrl;
|
||||
private final String authToken;
|
||||
private int currPage;
|
||||
private int nPages;
|
||||
private String nextUrl;
|
||||
private int nCalls = 0;
|
||||
|
||||
private final Queue<String> queue = new PriorityBlockingQueue<>();
|
||||
|
||||
public ResearchFiIterator(final String baseUrl, final String authToken) {
|
||||
this.baseUrl = baseUrl;
|
||||
this.authToken = authToken;
|
||||
this.currPage = 0;
|
||||
this.nPages = 0;
|
||||
this.nextUrl = null;
|
||||
}
|
||||
|
||||
private void verifyStarted() {
|
||||
if (this.currPage == 0) {
|
||||
try {
|
||||
nextCall();
|
||||
} catch (final CollectorException e) {
|
||||
throw new IllegalStateException(e);
|
||||
|
||||
try {
|
||||
if (this.nCalls == 0) {
|
||||
this.nextUrl = invokeUrl(this.baseUrl);
|
||||
}
|
||||
} catch (final CollectorException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -62,9 +62,9 @@ public class ResearchFiIterator implements Iterator<String> {
|
|||
synchronized (this.queue) {
|
||||
verifyStarted();
|
||||
final String res = this.queue.poll();
|
||||
while (this.queue.isEmpty() && (this.currPage < this.nPages)) {
|
||||
while (this.queue.isEmpty() && StringUtils.isNotBlank(this.nextUrl)) {
|
||||
try {
|
||||
nextCall();
|
||||
this.nextUrl = invokeUrl(this.nextUrl);
|
||||
} catch (final CollectorException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
|
@ -73,18 +73,11 @@ public class ResearchFiIterator implements Iterator<String> {
|
|||
}
|
||||
}
|
||||
|
||||
private void nextCall() throws CollectorException {
|
||||
private String invokeUrl(final String url) throws CollectorException {
|
||||
|
||||
this.currPage += 1;
|
||||
this.nCalls += 1;
|
||||
String next = null;
|
||||
|
||||
final String url;
|
||||
if (!this.baseUrl.contains("?")) {
|
||||
url = String.format("%s?PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
|
||||
} else if (!this.baseUrl.contains("PageSize=")) {
|
||||
url = String.format("%s&PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
|
||||
} else {
|
||||
url = String.format("%s&PageNumber=%d", this.baseUrl, this.currPage);
|
||||
}
|
||||
log.info("Calling url: " + url);
|
||||
|
||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
|
@ -94,11 +87,15 @@ public class ResearchFiIterator implements Iterator<String> {
|
|||
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||
for (final Header header : response.getAllHeaders()) {
|
||||
log.debug("HEADER: " + header.getName() + " = " + header.getValue());
|
||||
if ("x-page-count".equals(header.getName())) {
|
||||
final int totalPages = NumberUtils.toInt(header.getValue());
|
||||
if (this.nPages != totalPages) {
|
||||
this.nPages = NumberUtils.toInt(header.getValue());
|
||||
log.info("Total pages: " + totalPages);
|
||||
if ("link".equals(header.getName())) {
|
||||
final String s = StringUtils.substringBetween(header.getValue(), "<", ">");
|
||||
final String token = StringUtils
|
||||
.substringBefore(StringUtils.substringAfter(s, "NextPageToken="), "&");
|
||||
|
||||
if (this.baseUrl.contains("?")) {
|
||||
next = this.baseUrl + "&NextPageToken=" + token;
|
||||
} else {
|
||||
next = this.baseUrl + "?NextPageToken=" + token;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -108,6 +105,9 @@ public class ResearchFiIterator implements Iterator<String> {
|
|||
|
||||
jsonArray.forEach(obj -> this.queue.add(JsonUtils.convertToXML(obj.toString())));
|
||||
}
|
||||
|
||||
return next;
|
||||
|
||||
} catch (final Throwable e) {
|
||||
log.warn("Error calling url: " + url, e);
|
||||
throw new CollectorException("Error calling url: " + url, e);
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
/**
|
||||
* The type Pubmed Affiliation.
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
public class PMAffiliation {
|
||||
|
||||
private String name;
|
||||
|
||||
private PMIdentifier identifier;
|
||||
|
||||
public PMAffiliation() {
|
||||
|
||||
}
|
||||
|
||||
public PMAffiliation(String name, PMIdentifier identifier) {
|
||||
this.name = name;
|
||||
this.identifier = identifier;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public PMIdentifier getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public void setIdentifier(PMIdentifier identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
}
|
|
@ -8,259 +8,115 @@ import java.util.List;
|
|||
/**
|
||||
* This class represent an instance of Pubmed Article extracted from the native XML
|
||||
*
|
||||
* @author Sandro La Bruzzo
|
||||
*/
|
||||
|
||||
public class PMArticle implements Serializable {
|
||||
|
||||
/**
|
||||
* the Pubmed Identifier
|
||||
*/
|
||||
private String pmid;
|
||||
|
||||
private String pmcId;
|
||||
|
||||
/**
|
||||
* the DOI
|
||||
*/
|
||||
private String doi;
|
||||
/**
|
||||
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
|
||||
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
|
||||
*/
|
||||
private String date;
|
||||
/**
|
||||
* This is an 'envelope' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself.
|
||||
*/
|
||||
private PMJournal journal;
|
||||
/**
|
||||
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the <MedlineTA> element.
|
||||
*/
|
||||
private String title;
|
||||
/**
|
||||
* English-language abstracts are taken directly from the published article.
|
||||
* If the article does not have a published abstract, the National Library of Medicine does not create one,
|
||||
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
|
||||
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
|
||||
*/
|
||||
private String description;
|
||||
/**
|
||||
* the language in which an article was published is recorded in <Language>.
|
||||
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
|
||||
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
|
||||
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
|
||||
*/
|
||||
private String language;
|
||||
|
||||
/**
|
||||
* NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations. *
|
||||
*/
|
||||
private final List<PMSubject> subjects = new ArrayList<>();
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*/
|
||||
private final List<PMSubject> publicationTypes = new ArrayList<>();
|
||||
/**
|
||||
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
|
||||
*/
|
||||
private List<PMSubject> subjects;
|
||||
private List<PMSubject> publicationTypes = new ArrayList<>();
|
||||
private List<PMAuthor> authors = new ArrayList<>();
|
||||
private List<PMGrant> grants = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
|
||||
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
|
||||
*/
|
||||
private final List<PMGrant> grants = new ArrayList<>();
|
||||
|
||||
/**
|
||||
* get the DOI
|
||||
* @return a DOI
|
||||
*/
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the DOI
|
||||
* @param doi a DOI
|
||||
*/
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the Pubmed Identifier
|
||||
* @return the PMID
|
||||
*/
|
||||
public String getPmid() {
|
||||
return pmid;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the Pubmed Identifier
|
||||
* @param pmid the Pubmed Identifier
|
||||
*/
|
||||
public void setPmid(String pmid) {
|
||||
this.pmid = pmid;
|
||||
}
|
||||
|
||||
/**
|
||||
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
|
||||
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
|
||||
*
|
||||
* @return the Pubmed Date
|
||||
*/
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the pubmed Date
|
||||
* @param date
|
||||
*/
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
/**
|
||||
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
|
||||
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
|
||||
* The NLM journal title abbreviation is exported in the <MedlineTA> element.
|
||||
*
|
||||
* @return the pubmed Journal Extracted
|
||||
*/
|
||||
public PMJournal getJournal() {
|
||||
return journal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the mapped pubmed Journal
|
||||
* @param journal
|
||||
*/
|
||||
public void setJournal(PMJournal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
/**
|
||||
* <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English;
|
||||
* those titles originally published in a non-English language and translated for <ArticleTitle> are enclosed in square brackets.
|
||||
* All titles end with a period unless another punctuation mark such as a question mark or bracket is present.
|
||||
* Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl).
|
||||
* Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000.
|
||||
*
|
||||
* @return the extracted pubmed Title
|
||||
*/
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
/**
|
||||
* set the pubmed title
|
||||
* @param title
|
||||
*/
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
/**
|
||||
* English-language abstracts are taken directly from the published article.
|
||||
* If the article does not have a published abstract, the National Library of Medicine does not create one,
|
||||
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
|
||||
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
|
||||
*
|
||||
* @return the Mapped Pubmed Article Abstracts
|
||||
*/
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the Mapped Pubmed Article Abstracts
|
||||
* @param description
|
||||
*/
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
|
||||
*
|
||||
* @return get the Mapped Authors lists
|
||||
*/
|
||||
public List<PMAuthor> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the Mapped Authors lists
|
||||
* @param authors
|
||||
*/
|
||||
public void setAuthors(List<PMAuthor> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*
|
||||
* @return the mapped Subjects
|
||||
*/
|
||||
public List<PMSubject> getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* the language in which an article was published is recorded in <Language>.
|
||||
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
|
||||
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
|
||||
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
|
||||
*
|
||||
* @return The mapped Language
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Set The mapped Language
|
||||
*
|
||||
* @param language the mapped Language
|
||||
*/
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
/**
|
||||
* This element is used to identify the type of article indexed for MEDLINE;
|
||||
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
|
||||
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
|
||||
*
|
||||
* @return the mapped Publication Type
|
||||
*/
|
||||
public List<PMSubject> getPublicationTypes() {
|
||||
return publicationTypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
|
||||
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
|
||||
* @return the mapped grants
|
||||
*/
|
||||
|
||||
public List<PMGrant> getGrants() {
|
||||
return grants;
|
||||
}
|
||||
|
||||
public String getPmcId() {
|
||||
return pmcId;
|
||||
}
|
||||
|
||||
public PMArticle setPmcId(String pmcId) {
|
||||
public void setPmcId(String pmcId) {
|
||||
this.pmcId = pmcId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
public PMJournal getJournal() {
|
||||
return journal;
|
||||
}
|
||||
|
||||
public void setJournal(PMJournal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public void setLanguage(String language) {
|
||||
this.language = language;
|
||||
}
|
||||
|
||||
public List<PMSubject> getSubjects() {
|
||||
return subjects;
|
||||
}
|
||||
|
||||
public void setSubjects(List<PMSubject> subjects) {
|
||||
this.subjects = subjects;
|
||||
}
|
||||
|
||||
public List<PMSubject> getPublicationTypes() {
|
||||
return publicationTypes;
|
||||
}
|
||||
|
||||
public void setPublicationTypes(List<PMSubject> publicationTypes) {
|
||||
this.publicationTypes = publicationTypes;
|
||||
}
|
||||
|
||||
public List<PMAuthor> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
public void setAuthors(List<PMAuthor> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
public List<PMGrant> getGrants() {
|
||||
return grants;
|
||||
}
|
||||
|
||||
public void setGrants(List<PMGrant> grants) {
|
||||
this.grants = grants;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,6 +12,8 @@ public class PMAuthor implements Serializable {
|
|||
|
||||
private String lastName;
|
||||
private String foreName;
|
||||
private PMIdentifier identifier;
|
||||
private PMAffiliation affiliation;
|
||||
|
||||
/**
|
||||
* Gets last name.
|
||||
|
@ -59,4 +61,40 @@ public class PMAuthor implements Serializable {
|
|||
.format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : "");
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets identifier.
|
||||
*
|
||||
* @return the identifier
|
||||
*/
|
||||
public PMIdentifier getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets identifier.
|
||||
*
|
||||
* @param identifier the identifier
|
||||
*/
|
||||
public void setIdentifier(PMIdentifier identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets affiliation.
|
||||
*
|
||||
* @return the affiliation
|
||||
*/
|
||||
public PMAffiliation getAffiliation() {
|
||||
return affiliation;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets affiliation.
|
||||
*
|
||||
* @param affiliation the affiliation
|
||||
*/
|
||||
public void setAffiliation(PMAffiliation affiliation) {
|
||||
this.affiliation = affiliation;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.bio.pubmed;
|
||||
|
||||
/**
 * A persistent identifier made of a value ({@code pid}) and a type.
 *
 * ORCID values given in a compact form or as an ORCID URL are normalised to the dashed form
 * (for example {@code 0000-0001-6302-5705}) whenever the pid is set. The class is serializable
 * because {@code PMAuthor}, which is serializable, keeps a {@code PMIdentifier} as a field.
 */
public class PMIdentifier implements java.io.Serializable {

	/** Compact ORCID: 16 characters without dashes, the last one being a digit or {@code X}. */
	private static final String COMPACT_ORCID_REGEX = "[0-9]{15}[0-9X]";

	/** ORCID URL over http or https; the last character of the iD is a digit or {@code X}. */
	private static final String ORCID_URL_REGEX = "https?://orcid\\.org/[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]";

	private String pid;
	private String type;

	/**
	 * Creates an identifier, normalising the pid as described in {@link #setPid(String)}.
	 *
	 * @param pid the identifier value
	 * @param type the identifier type
	 */
	public PMIdentifier(String pid, String type) {
		this.pid = cleanPid(pid);
		this.type = type;
	}

	/**
	 * Creates an empty identifier; pid and type are {@code null} until set.
	 */
	public PMIdentifier() {

	}

	/**
	 * Normalises ORCID values to the dashed form; any other value is returned unchanged.
	 *
	 * @param pid the raw identifier value, possibly {@code null}
	 * @return the normalised value, or {@code null} when {@code pid} is {@code null}
	 */
	private String cleanPid(String pid) {

		if (pid == null) {
			return null;
		}

		// clean ORCID ID in the form 0000000163025705 to 0000-0001-6302-5705
		if (pid.matches(COMPACT_ORCID_REGEX)) {
			return pid.replaceAll("(.{4})(.{4})(.{4})(.{4})", "$1-$2-$3-$4");
		}

		// clean ORCID in the form http://orcid.org/0000-0001-8567-3543 to 0000-0001-8567-3543;
		// the https scheme and a trailing X check character are accepted as well
		if (pid.matches(ORCID_URL_REGEX)) {
			return pid.substring(pid.lastIndexOf('/') + 1);
		}
		return pid;
	}

	/**
	 * Gets pid.
	 *
	 * @return the identifier value
	 */
	public String getPid() {
		return pid;
	}

	/**
	 * Sets pid, normalising ORCID values to the dashed form.
	 *
	 * @param pid the identifier value
	 * @return this identifier, to allow chained calls
	 */
	public PMIdentifier setPid(String pid) {
		this.pid = cleanPid(pid);
		return this;
	}

	/**
	 * Gets type.
	 *
	 * @return the identifier type
	 */
	public String getType() {
		return type;
	}

	/**
	 * Sets type.
	 *
	 * @param type the identifier type
	 * @return this identifier, to allow chained calls
	 */
	public PMIdentifier setType(String type) {
		this.type = type;
		return this;
	}
}
|
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the path of the input json",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,58 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorNumber</name>
|
||||
<value>4</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<value>15G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<value>6G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,53 @@
|
|||
<workflow-app name="Update_RAiD_action_set" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>raidJsonInputPath</name>
|
||||
<description>the path of the json</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>raidActionSetPath</name>
|
||||
<description>path where to store the action set</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="deleteoutputpath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="deleteoutputpath">
|
||||
<fs>
|
||||
<delete path='${raidActionSetPath}'/>
|
||||
<mkdir path='${raidActionSetPath}'/>
|
||||
</fs>
|
||||
<ok to="processRAiDFile"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="processRAiDFile">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ProcessRAiDFile</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${raidJsonInputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${raidActionSetPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,8 +1,7 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"skipUpdate", "paramDescription": "skip update ", "paramRequired": false},
|
||||
{"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the working path ", "paramRequired": true}
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the baseline path", "paramRequired": true},
|
||||
{"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the mdstore path to save", "paramRequired": true}
|
||||
|
||||
]
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Download_Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>baselineWorkingPath</name>
|
||||
|
@ -16,11 +16,6 @@
|
|||
<name>mdStoreManagerURI</name>
|
||||
<description>the path of the cleaned mdstore</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>skipUpdate</name>
|
||||
<value>false</value>
|
||||
<description>The request block size</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="StartTransaction"/>
|
||||
|
@ -44,16 +39,16 @@
|
|||
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
|
||||
<capture-output/>
|
||||
</java>
|
||||
<ok to="ConvertDataset"/>
|
||||
<ok to="TransformPubMed"/>
|
||||
<error to="RollBack"/>
|
||||
</action>
|
||||
|
||||
<action name="ConvertDataset">
|
||||
<action name="TransformPubMed">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Baseline to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame</class>
|
||||
<name>Convert Baseline Pubmed to OAF Dataset</name>
|
||||
<class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
|
@ -65,12 +60,10 @@
|
|||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${baselineWorkingPath}</arg>
|
||||
<arg>--sourcePath</arg><arg>${baselineWorkingPath}</arg>
|
||||
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
|
||||
<arg>--skipUpdate</arg><arg>${skipUpdate}</arg>
|
||||
</spark>
|
||||
<ok to="CommitVersion"/>
|
||||
<error to="RollBack"/>
|
||||
|
|
|
@ -667,7 +667,6 @@ case object Crossref2Oaf {
|
|||
val doi = input.getString(0)
|
||||
val rorId = input.getString(1)
|
||||
|
||||
|
||||
val pubId = IdentifierFactory.idFromPid("50", "doi", DoiCleaningRule.clean(doi), true)
|
||||
val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
|
||||
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
package eu.dnetlib.dhp.sx.bio.ebi
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import eu.dnetlib.dhp.common.Constants
|
||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser2, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.transformation.TransformSparkJobNode
|
||||
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
/** Spark application that parses a PubMed baseline dump, converts every article to OAF
  * and writes the result in the HDFS location of an mdstore version.
  *
  * @param propertyPath path of the argument specification, forwarded to AbstractScalaApplication
  * @param args         the command line arguments, forwarded to AbstractScalaApplication
  * @param log          the logger used by the application
  */
class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  /** Reads the `isLookupUrl`, `sourcePath` and `mdstoreOutputVersion` arguments, resolves the
    * output location from the (JSON serialized) mdstore version, loads the vocabularies from
    * the lookup service and delegates the actual work to [[createPubmedDump]].
    */
  override def run(): Unit = {
    val isLookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", isLookupUrl)
    val sourcePath = parser.get("sourcePath")
    log.info(s"SourcePath is '$sourcePath'")
    val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
    log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")

    // the argument carries a JSON serialized MDStoreVersion: its HDFS path is the output base path
    val mapper = new ObjectMapper()
    val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
    val outputBasePath = cleanedMdStoreVersion.getHdfsPath
    log.info(s"outputBasePath is '$outputBasePath'")

    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)

    createPubmedDump(spark, sourcePath, outputBasePath, vocabularies)
  }

  /** Creates a dump of the PubMed articles.
    *
    * The source is read as text split on the `</PubmedArticle>` closing tag; every chunk that
    * contains a `<PubmedArticle>` opening tag is parsed, de-duplicated by `pmid`, converted to
    * OAF and written as gzip-compressed JSON lines under `targetPath + MDSTORE_DATA_PATH`
    * (overwriting previous content). The number of written lines is then stored in
    * `targetPath + MDSTORE_SIZE_PATH`.
    *
    * A record that cannot be parsed makes the task fail with a RuntimeException that reports
    * the offending record and carries the original exception as its cause.
    *
    * @param spark the spark session, must not be null
    * @param sourcePath the path of the source file
    * @param targetPath the path of the target file
    * @param vocabularies the vocabularies
    */
  def createPubmedDump(
    spark: SparkSession,
    sourcePath: String,
    targetPath: String,
    vocabularies: VocabularyGroup
  ): Unit = {
    require(spark != null)

    implicit val PMEncoder: Encoder[PMArticle] = Encoders.bean(classOf[PMArticle])

    import spark.implicits._
    val df = spark.read.option("lineSep", "</PubmedArticle>").text(sourcePath)
    val mapper = new ObjectMapper()
    df.as[String]
      .map(s => {
        // drop whatever precedes the opening tag and restore the closing tag consumed as line separator
        val id = s.indexOf("<PubmedArticle>")
        if (id >= 0) s"${s.substring(id)}</PubmedArticle>" else null
      })
      .filter(s => s != null)
      .map { i =>
        try {
          new PMParser2().parse(i)
        } catch {
          // keep the original exception as cause, otherwise the real parsing problem is lost
          case e: Exception =>
            throw new RuntimeException(s"Error parsing article: $i", e)
        }
      }
      .dropDuplicates("pmid")
      .map { a =>
        val oaf = PubMedToOaf.convert(a, vocabularies)
        if (oaf != null)
          mapper.writeValueAsString(oaf)
        else
          null
      }
      .as[String]
      .filter(s => s != null)
      .write
      .option("compression", "gzip")
      .mode("overwrite")
      .text(targetPath + MDSTORE_DATA_PATH)

    // the size of the mdstore is the number of records actually written
    val mdStoreSize = spark.read.text(targetPath + MDSTORE_DATA_PATH).count
    writeHdfsFile(spark.sparkContext.hadoopConfiguration, "" + mdStoreSize, targetPath + MDSTORE_SIZE_PATH)
  }
}
|
||||
|
||||
/** Command line launcher of the SparkCreatePubmedDump application. */
object SparkCreatePubmedDump {

  /** Builds the application with the baseline_to_oaf argument specification,
    * initializes it and runs it.
    *
    * @param args the command line arguments
    */
  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(getClass)
    val application =
      new SparkCreatePubmedDump("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json", args, logger)
    application.initialize().run()
  }
}
|
|
@ -0,0 +1,277 @@
|
|||
package eu.dnetlib.dhp.sx.bio.pubmed
|
||||
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
|
||||
import javax.xml.stream.XMLEventReader
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.xml.{MetaData, NodeSeq}
|
||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText}
|
||||
|
||||
/** Parser of `PubmedArticle` XML records into PMArticle beans.
  *
  * `parse` works on the XML of a single article loaded in memory, `parse2` consumes a
  * stream of pull events.
  */
class PMParser2 {

  /** Extracts the value of an attribute from a MetaData object.
    * @param attrs the MetaData object
    * @param key the key of the attribute
    * @return the value of the attribute or null if the attribute is not found
    */
  private def extractAttributes(attrs: MetaData, key: String): String =
    attrs.get(key) match {
      case Some(values) if values != null && values.nonEmpty => values.head.text
      case _                                                 => null
    }

  /** Validates and formats a date given the year, month, and day as strings.
    *
    * The three parts must be integers and must denote an existing calendar day:
    * impossible dates (month 13, February 30, ...) are rejected.
    *
    * @param year the year as a string
    * @param month the month as a string
    * @param day the day as a string
    * @return the date as "year-MM-DD" (month and day zero padded, year printed as parsed)
    *         or null if the date is invalid
    */
  private def validate_Date(year: String, month: String, day: String): String = {
    try {
      val y = year.toInt
      val m = month.toInt
      val d = day.toInt
      // LocalDate.of throws a DateTimeException when the day does not exist in the calendar
      java.time.LocalDate.of(y, m, d)
      f"$y-$m%02d-$d%02d"
    } catch {
      // only recoverable failures mean "invalid date"; fatal errors must propagate
      case scala.util.control.NonFatal(_) => null
    }
  }

  /** Extracts the grant information from a NodeSeq object.
    *
    * @param gNode the `Grant` nodes
    * @return one PMGrant per node (grant id, agency, country) or an empty list if there are no nodes
    */
  private def extractGrant(gNode: NodeSeq): List[PMGrant] = {
    gNode
      .map(node => {
        val grantId = (node \ "GrantID").text
        val agency = (node \ "Agency").text
        val country = (node \ "Country").text
        new PMGrant(grantId, agency, country)
      })
      .toList
  }

  /** Extracts the journal information (title, ISSN, volume, issue) from a NodeSeq object.
    *
    * @param jNode the `Journal` node
    * @return the journal information or null if the journal has no title
    */
  private def extractJournal(jNode: NodeSeq): PMJournal = {
    val journal = new PMJournal
    journal.setTitle((jNode \ "Title").text)
    journal.setIssn((jNode \ "ISSN").text)
    journal.setVolume((jNode \ "JournalIssue" \ "Volume").text)
    journal.setIssue((jNode \ "JournalIssue" \ "Issue").text)
    // a journal without title is considered missing (isNotEmpty also covers null)
    if (StringUtils.isNotEmpty(journal.getTitle))
      journal
    else
      null
  }

  /** Extracts the authors from a NodeSeq object.
    *
    * For every author the last name and fore name are always set; the identifier is set only
    * when both its value and its `Source` attribute are present; the affiliation is set only
    * when its name is present, and carries an identifier under the same condition.
    *
    * @param aNode the `Author` nodes
    * @return one PMAuthor per node or an empty list if there are no nodes
    */
  private def extractAuthors(aNode: NodeSeq): List[PMAuthor] = {
    aNode
      .map(author => {
        val a = new PMAuthor
        a.setLastName((author \ "LastName").text)
        a.setForeName((author \ "ForeName").text)
        val id = (author \ "Identifier").text
        val idType = (author \ "Identifier" \ "@Source").text

        if (id != null && id.nonEmpty && idType != null && idType.nonEmpty) {
          a.setIdentifier(new PMIdentifier(id, idType))
        }

        val affiliation = (author \ "AffiliationInfo" \ "Affiliation").text
        val affiliationId = (author \ "AffiliationInfo" \ "Identifier").text
        val affiliationIdType = (author \ "AffiliationInfo" \ "Identifier" \ "@Source").text

        if (affiliation != null && affiliation.nonEmpty) {
          val aff = new PMAffiliation()
          aff.setName(affiliation)
          if (
            affiliationId != null && affiliationId.nonEmpty && affiliationIdType != null && affiliationIdType.nonEmpty
          ) {
            aff.setIdentifier(new PMIdentifier(affiliationId, affiliationIdType))
          }
          a.setAffiliation(aff)
        }
        a
      })
      .toList
  }

  /** Parses the XML of a single `PubmedArticle` element.
    *
    * Fills grants, journal, authors, pmid, doi and pmc identifiers, date (from `DateCompleted`,
    * only when valid), title, description, language, at most the first 20 MeSH headings and
    * the publication types.
    *
    * @param input the XML of the article
    * @return the parsed article
    */
  def parse(input: String): PMArticle = {
    val xml = scala.xml.XML.loadString(input)
    val article = new PMArticle

    val grantNodes = xml \ "MedlineCitation" \\ "Grant"
    article.setGrants(extractGrant(grantNodes).asJava)

    val journal = xml \ "MedlineCitation" \ "Article" \ "Journal"
    article.setJournal(extractJournal(journal))

    val authors = xml \ "MedlineCitation" \ "Article" \ "AuthorList" \ "Author"

    article.setAuthors(
      extractAuthors(authors).asJava
    )

    val pmId = xml \ "MedlineCitation" \ "PMID"

    val articleIds = xml \ "PubmedData" \ "ArticleIdList" \ "ArticleId"
    articleIds.foreach(articleId => {
      val idType = (articleId \ "@IdType").text
      val id = articleId.text
      if ("doi".equalsIgnoreCase(idType)) article.setDoi(id)
      if ("pmc".equalsIgnoreCase(idType)) article.setPmcId(id)
    })
    article.setPmid(pmId.text)

    val pubMedPubDate = xml \ "MedlineCitation" \ "DateCompleted"
    val currentDate =
      validate_Date((pubMedPubDate \ "Year").text, (pubMedPubDate \ "Month").text, (pubMedPubDate \ "Day").text)
    if (currentDate != null) article.setDate(currentDate)

    val articleTitle = xml \ "MedlineCitation" \ "Article" \ "ArticleTitle"
    article.setTitle(articleTitle.text)

    val abstractText = xml \ "MedlineCitation" \ "Article" \ "Abstract" \ "AbstractText"
    if (abstractText != null && abstractText.text != null && abstractText.text.nonEmpty) {
      // A structured abstract has one AbstractText element per section: each section is
      // normalised on its own and the sections are joined with a blank, otherwise the last
      // word of a section runs into the first word of the next one.
      val description = abstractText.toList
        .map(section => section.text.split("\n").map(s => s.trim).mkString(" ").trim)
        .filter(section => section.nonEmpty)
        .mkString(" ")
      article.setDescription(description)
    }

    // NOTE(review): with several Language elements the codes are concatenated — confirm this is intended
    val language = xml \ "MedlineCitation" \ "Article" \ "Language"
    article.setLanguage(language.text)

    val subjects = xml \ "MedlineCitation" \ "MeshHeadingList" \ "MeshHeading"
    article.setSubjects(
      subjects
        .take(20)
        .map(subject => {
          val descriptorName = (subject \ "DescriptorName").text
          val ui = (subject \ "DescriptorName" \ "@UI").text
          val s = new PMSubject
          s.setValue(descriptorName)
          s.setMeshId(ui)
          s
        })
        .toList
        .asJava
    )
    val publicationTypes = xml \ "MedlineCitation" \ "Article" \ "PublicationTypeList" \ "PublicationType"
    article.setPublicationTypes(
      publicationTypes
        .map(pt => {
          val s = new PMSubject
          s.setValue(pt.text)
          s
        })
        .toList
        .asJava
    )

    article
  }

  /** Event based variant of the parser: consumes the reader until the end of the first
    * `PubmedArticle` element and returns the article built so far.
    *
    * Start events create the bean being filled (article, author, journal, grant, subject),
    * text events fill the bean according to the name of the last opened element, end events
    * attach the bean to the article.
    *
    * NOTE(review): the parameter type is javax.xml.stream.XMLEventReader while the matched
    * events are the scala.xml.pull case classes — confirm the intended reader type.
    *
    * @param xml the reader of the events
    * @return the parsed article, or null if the events end before a `PubmedArticle` is closed
    */
  def parse2(xml: XMLEventReader): PMArticle = {
    var currentArticle: PMArticle = null
    var currentSubject: PMSubject = null
    var currentAuthor: PMAuthor = null
    var currentJournal: PMJournal = null
    var currentGrant: PMGrant = null
    // name of the last opened element: drives the handling of the text events
    var currNode: String = null
    // parts of the last date met, shared by PubMedPubDate and PubDate
    var currentYear = "0"
    var currentMonth = "01"
    var currentDay = "01"
    var currentArticleType: String = null

    while (xml.hasNext) {
      val ne = xml.next
      ne match {
        case EvElemStart(_, label, attrs, _) =>
          currNode = label

          label match {
            case "PubmedArticle" => currentArticle = new PMArticle
            case "Author"        => currentAuthor = new PMAuthor
            case "Journal"       => currentJournal = new PMJournal
            case "Grant"         => currentGrant = new PMGrant
            case "PublicationType" | "DescriptorName" =>
              currentSubject = new PMSubject
              currentSubject.setMeshId(extractAttributes(attrs, "UI"))
            case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
            case _           =>
          }
        case EvElemEnd(_, label) =>
          label match {
            case "PubmedArticle" => return currentArticle
            case "Author"        => currentArticle.getAuthors.add(currentAuthor)
            case "Journal"       => currentArticle.setJournal(currentJournal)
            case "Grant"         => currentArticle.getGrants.add(currentGrant)
            case "PubMedPubDate" =>
              // only the first PubMedPubDate sets the date of the article
              if (currentArticle.getDate == null)
                currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
            case "PubDate"         => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
            case "DescriptorName"  => currentArticle.getSubjects.add(currentSubject)
            case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
            case _                 =>
          }
        case EvText(text) =>
          if (currNode != null && text.trim.nonEmpty)
            currNode match {
              case "ArticleTitle" => {
                // the text of an element can arrive in several events: append to what is already there
                if (currentArticle.getTitle == null)
                  currentArticle.setTitle(text.trim)
                else
                  currentArticle.setTitle(currentArticle.getTitle + text.trim)
              }
              case "AbstractText" => {
                if (currentArticle.getDescription == null)
                  currentArticle.setDescription(text.trim)
                else
                  currentArticle.setDescription(currentArticle.getDescription + text.trim)
              }
              case "PMID" => currentArticle.setPmid(text.trim)
              case "ArticleId" =>
                if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
                if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(text.trim)
              case "Language"                           => currentArticle.setLanguage(text.trim)
              case "ISSN"                               => currentJournal.setIssn(text.trim)
              case "GrantID"                            => currentGrant.setGrantID(text.trim)
              case "Agency"                             => currentGrant.setAgency(text.trim)
              case "Country"                            => if (currentGrant != null) currentGrant.setCountry(text.trim)
              case "Year"                               => currentYear = text.trim
              case "Month"                              => currentMonth = text.trim
              case "Day"                                => currentDay = text.trim
              case "Volume"                             => currentJournal.setVolume(text.trim)
              case "Issue"                              => currentJournal.setIssue(text.trim)
              case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
              case "LastName" => {
                if (currentAuthor != null)
                  currentAuthor.setLastName(text.trim)
              }
              case "ForeName" =>
                if (currentAuthor != null)
                  currentAuthor.setForeName(text.trim)
              case "Title" =>
                if (currentJournal.getTitle == null)
                  currentJournal.setTitle(text.trim)
                else
                  currentJournal.setTitle(currentJournal.getTitle + text.trim)
              case _ =>

            }
        case _ =>
      }

    }
    null
  }

}
|
|
@ -294,6 +294,24 @@ object PubMedToOaf {
|
|||
author.setName(a.getForeName)
|
||||
author.setSurname(a.getLastName)
|
||||
author.setFullname(a.getFullName)
|
||||
if (a.getIdentifier != null) {
|
||||
author.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
a.getIdentifier.getPid,
|
||||
OafMapperUtils.qualifier(
|
||||
a.getIdentifier.getType,
|
||||
a.getIdentifier.getType,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
),
|
||||
dataInfo
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
}
|
||||
if (a.getAffiliation != null)
|
||||
author.setRawAffiliationString(List(a.getAffiliation.getName).asJava)
|
||||
author.setRank(index + 1)
|
||||
author
|
||||
}(collection.breakOut)
|
||||
|
|
|
@ -0,0 +1,165 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.raid;
|
||||
|
||||
import static java.nio.file.Files.createTempDirectory;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.OBJECT_MAPPER;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.rdd.RDD;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.opencitations.CreateOpenCitationsASTest;
|
||||
import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateRAiDActionSetJobTest {
|
||||
private static String input_path;
|
||||
private static String output_path;
|
||||
static SparkSession spark;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
|
||||
input_path = Paths
|
||||
.get(
|
||||
GenerateRAiDActionSetJobTest.class
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/raid/raid_example.json")
|
||||
.toURI())
|
||||
.toFile()
|
||||
.getAbsolutePath();
|
||||
|
||||
output_path = createTempDirectory(GenerateRAiDActionSetJobTest.class.getSimpleName() + "-")
|
||||
.toAbsolutePath()
|
||||
.toString();
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(GenerateRAiDActionSetJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", output_path);
|
||||
conf.set("hive.metastore.warehouse.dir", output_path);
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(GenerateRAiDActionSetJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
static void cleanUp() throws Exception {
|
||||
FileUtils.deleteDirectory(new File(output_path));
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
void testProcessRAiDEntities() {
|
||||
GenerateRAiDActionSetJob.processRAiDEntities(spark, input_path, output_path + "/test_raid_action_set");
|
||||
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<? extends Oaf> result = sc
|
||||
.sequenceFile(output_path + "/test_raid_action_set", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(AtomicAction::getPayload);
|
||||
|
||||
assertEquals(80, result.count());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPrepareRAiD() {
|
||||
|
||||
List<AtomicAction<? extends Oaf>> atomicActions = GenerateRAiDActionSetJob
|
||||
.prepareRAiD(
|
||||
new RAiDEntity(
|
||||
"-92190526",
|
||||
Arrays
|
||||
.asList(
|
||||
"Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura",
|
||||
"Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume",
|
||||
"Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont",
|
||||
"Maïeul GRUGET", "Cécile Duchêne"),
|
||||
"2021-09-10",
|
||||
"2024-02-16",
|
||||
Arrays
|
||||
.asList(
|
||||
"cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps",
|
||||
"pan-scalar map", "Python library", "QGIS", "map design", "landmarks",
|
||||
"Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]",
|
||||
"[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography",
|
||||
"eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency",
|
||||
"General Medicine", "Geography, Planning and Development", "multi-scales",
|
||||
"pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences",
|
||||
"progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design",
|
||||
"cartography, map generalisation, zoom, multi-scale map", "Interactive maps",
|
||||
"Map generalisation", "Earth and Planetary Sciences (miscellaneous)",
|
||||
"Cartographic generalization", "rivers", "Benchmark", "General Environmental Science",
|
||||
"open source", "drawing", "Constraint", "Multi-scale maps"),
|
||||
Arrays
|
||||
.asList(
|
||||
"Where do people look at during multi-scale map tasks?", "FogDetector survey raw data",
|
||||
"Collection of cartographic disorientation stories", "Anchorwhat dataset",
|
||||
"BasqueRoads: A Benchmark for Road Network Selection",
|
||||
"Progressive river network selection for pan-scalar maps",
|
||||
"BasqueRoads, a dataset to benchmark road selection algorithms",
|
||||
"Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps",
|
||||
"Empirical approach to advance the generalisation of multi-scale maps",
|
||||
"L'Alpe d'Huez: a dataset to benchmark topographic map generalisation",
|
||||
"eye-tracking data from a survey on zooming in a pan-scalar map",
|
||||
"Material of the experiment 'More is Less' from the MapMuxing project",
|
||||
"Cartagen4py, an open source Python library for map generalisation",
|
||||
"L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"),
|
||||
Arrays
|
||||
.asList(
|
||||
"50|doi_dedup___::6915135e0aa39f913394513f809ae58a",
|
||||
"50|doi_dedup___::754e3c283639bc6e104c925ff3e34007",
|
||||
"50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0",
|
||||
"50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a",
|
||||
"50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153",
|
||||
"50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a",
|
||||
"50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13",
|
||||
"50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4",
|
||||
"50|doi_dedup___::a9bc4453273b2d02648a5cb453195042",
|
||||
"50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7",
|
||||
"50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5",
|
||||
"50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283",
|
||||
"50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea",
|
||||
"50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"),
|
||||
"Exploring Multi-Scale Map Generalization and Design",
|
||||
"This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval."));
|
||||
|
||||
OtherResearchProduct orp = (OtherResearchProduct) atomicActions.get(0).getPayload();
|
||||
Relation rel = (Relation) atomicActions.get(1).getPayload();
|
||||
|
||||
assertEquals("Exploring Multi-Scale Map Generalization and Design", orp.getTitle().get(0).getValue());
|
||||
assertEquals("50|raid________::759a564ce5cc7360cab030c517c7366b", rel.getSource());
|
||||
assertEquals("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", rel.getTarget());
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
{"raid": "-9222092103004099540", "authors": ["Department of Archaeology & Museums", "Department of Archaeology and Museums", "Department Of Archaeology & Museums"], "subjects": ["Begamganj", "Raisen", "Bhopal", "Budhni", "Malwa site survey", "सीहोर", "Gauharganj", "बुधनी", "Budni", "Berasia"], "titles": ["Malwa site survey : Raisen District, Begamganj Tahsīl, photographic documentation", "Malwa site survey : Bhopal District, photographic documentation (version 1, TIFF files)", "Malwa site survey : Raisen District, Gauharganj Tahsīl, village finds", "Malwa site survey : Sehore सीहोर District, Budni Tahsīl, photographic documentation (part 1)", "Malwa site survey: Bhopal District, Berasia Tahsīl, photographic documentation (with villages named)", "Malwa site survey : Sehore सीहोर District, Budni Tahsīl, photographic documentation (part 2)", "Malwa site survey : Bhopal District, photographic documentation (version 2, JPEG files)"], "ids": ["50|doi_dedup___::7523d165970830dd857e6cbea4302adf", "50|doi_dedup___::02309ae8a9fae291df321e317f5c5330", "50|doi_dedup___::95347ba2c4264414fab39712ee7fe481", "50|doi_dedup___::970aa708fe667596754fd02a708780f5", "50|doi_dedup___::b7cd9128cc53b1257a4f000347f339b0", "50|doi_dedup___::c7d65da0ecedef4d2c702b9db197d90c", "50|doi_dedup___::addbb67cf5046e340f342ba091bcebfa"], "title": "Documentation of Malwa Region", "summary": "This project involves the documentation of the Malwa region through photographic surveys. The surveys were conducted by the Department of Archaeology and Museums, Madhya Pradesh, and cover various districts and tahsils. The documentation includes photographic records of sites, villages, and other relevant features. The project aims to provide a comprehensive understanding of the region's cultural and historical significance.", "startDate": "2019-03-06", "endDate": "2019-03-08"}
|
||||
{"raid": "-9221424331076109424", "authors": ["Hutchings, Judy", "Ward, Catherine", "Baban, Adriana", "D<><44>nil<69><6C>, Ingrid", "Frantz, Inga", "Gardner, Frances", "Lachman, Jamie", "Lachman, Jamie M.", "Foran, Heather", "Heinrichs, Nina", "Murphy, Hugh", "B<><42>ban, Adriana", "Raleva, Marija", "Fang, Xiangming", "Jansen, Elena", "Taut, Diana", "Foran, Heather M.", "T<><54>ut, Diana", "Ward, Catherine L.", "Williams, Margiad", "Lesco, Galina", "Brühl, Antonia"], "subjects": ["3. Good health", "5. Gender equality", "Criminology not elsewhere classified", "1. No poverty", "2. Zero hunger"], "titles": ["sj-docx-1-vaw-10.1177_10778012231188090 - Supplemental material for Co-Occurrence of Intimate Partner Violence Against Mothers and Maltreatment of Their Children With Behavioral Problems in Eastern Europe", "Hunger in vulnerable families in Southeastern Europe: Associations with health and violence", "Prevention of child mental health problems through parenting interventions in Southeastern Europe (RISE): study protocol for a multi-site randomised controlled trial"], "ids": ["50|doi_dedup___::a70015063e5400dae2e097ee10b4a589", "50|doi_dedup___::6e1d12026fcde9087724622ccdeed430", "50|doi_dedup___::5b7bd5d46c5d95e2ef5b36663504a67e"], "title": "Exploring the Impact of Hunger and Violence on Child Health in Southeastern Europe", "summary": "This study aims to investigate the relationship between hunger, violence, and child health in vulnerable families in Southeastern Europe. The research will explore the experiences of families in FYR Macedonia, Republic of Moldova, and Romania, and examine the associations between hunger, maltreatment, and other health indicators. The study will also test the efficacy of a parenting intervention targeting child behavioral problems in alleviating these issues. 
The findings of this research will contribute to the development of effective interventions to address the complex needs of vulnerable families in the region.", "startDate": "2019-06-04", "endDate": "2023-01-01"}
|
||||
{"raid": "-9219052635741785098", "authors": ["Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura", "Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume", "Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont", "Maïeul GRUGET", "Cécile Duchêne"], "subjects": ["cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps", "pan-scalar map", "Python library", "QGIS", "map design", "landmarks", "Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]", "[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography", "eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency", "General Medicine", "Geography, Planning and Development", "multi-scales", "pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences", "progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design", "cartography, map generalisation, zoom, multi-scale map", "Interactive maps", "Map generalisation", "Earth and Planetary Sciences (miscellaneous)", "Cartographic generalization", "rivers", "Benchmark", "General Environmental Science", "open source", "drawing", "Constraint", "Multi-scale maps"], "titles": ["Where do people look at during multi-scale map tasks?", "FogDetector survey raw data", "Collection of cartographic disorientation stories", "Anchorwhat dataset", "BasqueRoads: A Benchmark for Road Network Selection", "Progressive river network selection for pan-scalar maps", "BasqueRoads, a dataset to benchmark road selection algorithms", "Missing the city for buildings? 
A critical review of pan-scalar map generalization and design in contemporary zoomable maps", "Empirical approach to advance the generalisation of multi-scale maps", "L'Alpe d'Huez: a dataset to benchmark topographic map generalisation", "eye-tracking data from a survey on zooming in a pan-scalar map", "Material of the experiment \"More is Less\" from the MapMuxing project", "Cartagen4py, an open source Python library for map generalisation", "L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"], "ids": ["50|doi_dedup___::6915135e0aa39f913394513f809ae58a", "50|doi_dedup___::754e3c283639bc6e104c925ff3e34007", "50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0", "50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a", "50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153", "50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a", "50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13", "50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4", "50|doi_dedup___::a9bc4453273b2d02648a5cb453195042", "50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7", "50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5", "50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283", "50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea", "50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"], "title": "Exploring Multi-Scale Map Generalization and Design", "summary": "This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. 
The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval.", "startDate": "2021-09-10", "endDate": "2024-02-16"}
|
||||
{"raid": "-9216828847055450272", "authors": ["Grey, Alan", "Gorelov, Sergey", "Pall, Szilard", "Merz, Pascal", "Justin A., Lemkul", "Szilárd Páll", "Pasquadibisceglie, Andrea", "Kutzner, Carsten", "Schulz, Roland", "Nabet, Julien", "Abraham, Mark", "Jalalypour, Farzaneh", "Lundborg, Magnus", "Gray, Alan", "Villa, Alessandra", "Berk Hess", "Santuz, Hubert", "Irrgang, M. Eric", "Wingbermuehle, Sebastian", "Lemkul, Justin A.", "Jordan, Joe", "Pellegrino, Michele", "Doijade, Mahesh", "Shvetsov, Alexey", "Hess, Berk", "Behera, Sudarshan", "Andrey Alekseenko", "Shugaeva, Tatiana", "Fleischmann, Stefan", "Bergh, Cathrine", "Morozov, Dmitry", "Adam Hospital", "Briand, Eliane", "Lindahl, Erik", "Brown, Ania", "Marta Lloret Llinares", "Miletic, Vedran", "Alekseenko, Andrey", "Gouaillardet, Gilles", "Fiorin, Giacomo", "Basov, Vladimir"], "subjects": ["webinar"], "titles": ["Introduction to HPC: molecular dynamics simulations with GROMACS: log files", "BioExcel webinar #73: Competency frameworks to support training design and professional development", "Introduction to HPC: molecular dynamics simulations with GROMACS: output files - Devana", "GROMACS 2024.0 Manual", "BioExcel Webinar #71: GROMACS-PMX for accurate estimation of free energy differences", "Introduction to HPC: molecular dynamics simulations with GROMACS: input files", "BioExcel Webinar #68: What's new in GROMACS 2023", "BioExcel Webinar #69: BioBB-Wfs and BioBB-API, integrated web-based platform and programmatic interface for biomolecular simulations workflows using the BioExcel Building Blocks library", "GROMACS 2024-beta Source code"], "ids": ["50|doi_dedup___::8318fbc815ee1943c3269be7567f220b", "50|doi_dedup___::9530e03fb2aac63e82b18a40dc09e32c", "50|doi_dedup___::30174ab31075e76a428ca5b4f4d236b8", "50|doi_________::70b7c6dce09ae6f1361d22913fdf95eb", "50|doi_dedup___::337dd48600618f3c06257edd750d6201", "50|doi_dedup___::d622992ba9077617f37ebd268b3e806d", "50|doi_dedup___::0b0bcc6825d6c052c37882fd5cfc1e8c", 
"50|doi_dedup___::4b1541a7cee32527c65ace5d1ed57335", "50|doi_dedup___::1379861df59bd755e4fb39b9f95ffbd3"], "title": "Exploring High-Performance Computing and Biomolecular Simulations", "summary": "This project involves exploring high-performance computing (HPC) and biomolecular simulations using GROMACS. The objectives include understanding molecular dynamics simulations, log files, input files, and output files. Additionally, the project aims to explore competency frameworks for professional development, specifically in the field of computational biomolecular research. The tools and techniques used will include GROMACS, BioExcel Building Blocks, and competency frameworks. The expected outcomes include a deeper understanding of HPC and biomolecular simulations, as well as the development of skills in using GROMACS and BioExcel Building Blocks. The project will also contribute to the development of competency frameworks for professional development in the field of computational biomolecular research.", "startDate": "2023-04-25", "endDate": "2024-01-30"}
|
||||
{"raid": "-9210544816395499758", "authors": ["Bateson, Melissa", "Andrews, Clare", "Verhulst, Simon", "Nettle, Daniel", "Zuidersma, Erica"], "subjects": ["2. Zero hunger"], "titles": ["Exposure to food insecurity increases energy storage and reduces somatic maintenance in European starlings", "Data and code archive for Andrews et al. 'Exposure to food insecurity increases energy storage and reduces somatic maintenance in European starlings'"], "ids": ["50|doi_dedup___::176117239be06189523c253e0ca9c5ec", "50|doi_dedup___::343e0b0ddf0d54763a89a62af1f7a379"], "title": "Investigating the Effects of Food Insecurity on Energy Storage and Somatic Maintenance in European Starlings", "summary": "This study examines the impact of food insecurity on energy storage and somatic maintenance in European starlings. The research involved exposing juvenile starlings to either uninterrupted food availability or a regime of unpredictable food unavailability. The results show that birds exposed to food insecurity stored more energy, but at the expense of somatic maintenance and repair. The study provides insights into the adaptive responses of birds to food scarcity and the trade-offs involved in energy storage and maintenance.", "startDate": "2021-06-28", "endDate": "2021-06-28"}
|
||||
{"raid": "-9208499171224730388", "authors": ["Maniati, Eleni", "Bakker, Bjorn", "McClelland, Sarah E.", "Shaikh, Nadeem", "De Angelis, Simone", "Johnson, Sarah C.", "Wang, Jun", "Foijer, Floris", "Spierings, Diana C. J.", "Boemo, Michael A.", "Wardenaar, René", "Mazzagatti, Alice"], "subjects": [], "titles": ["Additional file 2 of Replication stress generates distinctive landscapes of DNA copy number alterations and chromosome scale losses", "Additional file 5 of Replication stress generates distinctive landscapes of DNA copy number alterations and chromosome scale losses"], "ids": ["50|doi_dedup___::a1bfeb173971f74a274fab8bdd78a4bc", "50|doi_dedup___::3d6e151aaeb2f7c40a320207fdd80ade"], "title": "Analysis of DNA Copy Number Alterations and Chromosome Scale Losses", "summary": "This study analyzed the effects of replication stress on DNA copy number alterations and chromosome scale losses. The results show distinctive landscapes of these alterations and losses, which were further investigated in additional files. The study provides valuable insights into the mechanisms of replication stress and its impact on genomic stability.", "startDate": "2022-01-01", "endDate": "2022-01-01"}
|
|
@ -0,0 +1,157 @@
|
|||
<PubmedArticle>
|
||||
<MedlineCitation Status="MEDLINE" IndexingMethod="Curated" Owner="NLM">
|
||||
<PMID Version="1">37318999</PMID>
|
||||
<DateCompleted>
|
||||
<Year>2024</Year>
|
||||
<Month>02</Month>
|
||||
<Day>09</Day>
|
||||
</DateCompleted>
|
||||
<DateRevised>
|
||||
<Year>2024</Year>
|
||||
<Month>02</Month>
|
||||
<Day>09</Day>
|
||||
</DateRevised>
|
||||
<Article PubModel="Print-Electronic">
|
||||
<Journal>
|
||||
<ISSN IssnType="Electronic">1522-1229</ISSN>
|
||||
<JournalIssue CitedMedium="Internet">
|
||||
<Volume>47</Volume>
|
||||
<Issue>3</Issue>
|
||||
<PubDate>
|
||||
<Year>2023</Year>
|
||||
<Month>Sep</Month>
|
||||
<Day>01</Day>
|
||||
</PubDate>
|
||||
</JournalIssue>
|
||||
<Title>Advances in physiology education</Title>
|
||||
<ISOAbbreviation>Adv Physiol Educ</ISOAbbreviation>
|
||||
</Journal>
|
||||
<ArticleTitle>Providing the choice of in-person or videoconference attendance in a clinical physiology course may harm learning outcomes for the entire cohort.</ArticleTitle>
|
||||
<Pagination>
|
||||
<MedlinePgn>548-556</MedlinePgn>
|
||||
</Pagination>
|
||||
<ELocationID EIdType="doi" ValidYN="Y">10.1152/advan.00160.2022</ELocationID>
|
||||
<Abstract>
|
||||
<AbstractText>Clinical Physiology 1 and 2 are flipped classes in which students watch prerecorded videos before class. During the 3-h class, students take practice assessments, work in groups on critical thinking exercises, work through case studies, and engage in drawing exercises. Due to the COVID pandemic, these courses were transitioned from in-person classes to online classes. Despite the university's return-to-class policy, some students were reluctant to return to in-person classes; therefore during the 2021-2022 academic year, Clinical Physiology 1 and 2 were offered as flipped, hybrid courses. In a hybrid format, students either attended the synchronous class in person or online. Here we evaluate the learning outcomes and the perceptions of the learning experience for students who attended Clinical Physiology 1 and 2 either online (2020-2021) or in a hybrid format (2021-2022). In addition to exam scores, in-class surveys and end of course evaluations were compiled to describe the student experience in the flipped hybrid setting. Retrospective linear mixed-model regression analysis of exam scores revealed that a hybrid modality (2021-2022) was associated with lower exam scores when controlling for sex, graduate/undergraduate status, delivery method, and the order in which the courses were taken (<i>F</i> test: <i>F</i> = 8.65, df1 = 2, df2 = 179.28, <i>P</i> = 0.0003). In addition, being a Black Indigenous Person of Color (BIPOC) student is associated with a lower exam score, controlling for the same previous factors (<i>F</i> test: <i>F</i> = 4.23, df1 = 1, df2 = 130.28, <i>P</i> = 0.04), albeit with lower confidence; the BIPOC representation in this sample is small (BIPOC: <i>n</i> = 144; total: <i>n</i> = 504). There is no significant interaction between the hybrid modality and race, meaning that BIPOC and White students are both negatively affected in a hybrid flipped course. 
Instructors should consider carefully about offering hybrid courses and build in extra student support.<b>NEW & NOTEWORTHY</b> The transition from online to in-person teaching has been as challenging as the original transition to remote teaching with the onset of the pandemic. Since not all students were ready to return to the classroom, students could choose to take this course in person or online. This arrangement provided flexibility and opportunities for innovative class activities for students but introduced tradeoffs in lower test scores from the hybrid modality than fully online or fully in-person modalities.</AbstractText>
|
||||
</Abstract>
|
||||
<AuthorList CompleteYN="Y">
|
||||
<Author ValidYN="Y">
|
||||
<LastName>Anderson</LastName>
|
||||
<ForeName>Lisa Carney</ForeName>
|
||||
<Initials>LC</Initials>
|
||||
<Identifier Source="ORCID">0000-0003-2261-1921</Identifier>
|
||||
<AffiliationInfo>
|
||||
<Affiliation>Department of Integrative Biology and Physiology, University of Minnesota, Minneapolis, Minnesota, United States.</Affiliation>
|
||||
<Identifier Source="ROR">https://ror.org/017zqws13</Identifier>
|
||||
</AffiliationInfo>
|
||||
</Author>
|
||||
<Author ValidYN="Y">
|
||||
<LastName>Jacobson</LastName>
|
||||
<ForeName>Tate</ForeName>
|
||||
<Initials>T</Initials>
|
||||
<AffiliationInfo>
|
||||
<Affiliation>Department of Statistics, University of Minnesota, Minneapolis, Minnesota, United States.</Affiliation>
|
||||
</AffiliationInfo>
|
||||
</Author>
|
||||
</AuthorList>
|
||||
<Language>eng</Language>
|
||||
<PublicationTypeList>
|
||||
<PublicationType UI="D016428">Journal Article</PublicationType>
|
||||
</PublicationTypeList>
|
||||
<ArticleDate DateType="Electronic">
|
||||
<Year>2023</Year>
|
||||
<Month>06</Month>
|
||||
<Day>15</Day>
|
||||
</ArticleDate>
|
||||
</Article>
|
||||
<MedlineJournalInfo>
|
||||
<Country>United States</Country>
|
||||
<MedlineTA>Adv Physiol Educ</MedlineTA>
|
||||
<NlmUniqueID>100913944</NlmUniqueID>
|
||||
<ISSNLinking>1043-4046</ISSNLinking>
|
||||
</MedlineJournalInfo>
|
||||
<CitationSubset>IM</CitationSubset>
|
||||
<MeshHeadingList>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D010827" MajorTopicYN="Y">Physiology</DescriptorName>
|
||||
<QualifierName UI="Q000193" MajorTopicYN="N">education</QualifierName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D012189" MajorTopicYN="N">Retrospective Studies</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D007858" MajorTopicYN="N">Learning</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D058873" MajorTopicYN="N">Pandemics</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D000086382" MajorTopicYN="N">COVID-19</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D012044" MajorTopicYN="N">Regression Analysis</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D013334" MajorTopicYN="N">Students</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D008297" MajorTopicYN="N">Male</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D005260" MajorTopicYN="N">Female</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D044465" MajorTopicYN="N">White People</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D044383" MajorTopicYN="N">Black People</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D020375" MajorTopicYN="N">Education, Distance</DescriptorName>
|
||||
</MeshHeading>
|
||||
<MeshHeading>
|
||||
<DescriptorName UI="D003479" MajorTopicYN="N">Curriculum</DescriptorName>
|
||||
</MeshHeading>
|
||||
</MeshHeadingList>
|
||||
<KeywordList Owner="NOTNLM">
|
||||
<Keyword MajorTopicYN="N">flipped teaching</Keyword>
|
||||
<Keyword MajorTopicYN="N">hybrid teaching</Keyword>
|
||||
<Keyword MajorTopicYN="N">inequity</Keyword>
|
||||
<Keyword MajorTopicYN="N">learning outcomes</Keyword>
|
||||
<Keyword MajorTopicYN="N">responsive teaching</Keyword>
|
||||
</KeywordList>
|
||||
</MedlineCitation>
|
||||
<PubmedData>
|
||||
<History>
|
||||
<PubMedPubDate PubStatus="medline">
|
||||
<Year>2023</Year>
|
||||
<Month>7</Month>
|
||||
<Day>21</Day>
|
||||
<Hour>6</Hour>
|
||||
<Minute>44</Minute>
|
||||
</PubMedPubDate>
|
||||
<PubMedPubDate PubStatus="pubmed">
|
||||
<Year>2023</Year>
|
||||
<Month>6</Month>
|
||||
<Day>15</Day>
|
||||
<Hour>19</Hour>
|
||||
<Minute>14</Minute>
|
||||
</PubMedPubDate>
|
||||
<PubMedPubDate PubStatus="entrez">
|
||||
<Year>2023</Year>
|
||||
<Month>6</Month>
|
||||
<Day>15</Day>
|
||||
<Hour>12</Hour>
|
||||
<Minute>53</Minute>
|
||||
</PubMedPubDate>
|
||||
</History>
|
||||
<PublicationStatus>ppublish</PublicationStatus>
|
||||
<ArticleIdList>
|
||||
<ArticleId IdType="pubmed">37318999</ArticleId>
|
||||
<ArticleId IdType="doi">10.1152/advan.00160.2022</ArticleId>
|
||||
</ArticleIdList>
|
||||
</PubmedData>
|
||||
</PubmedArticle>
|
|
@ -5,7 +5,10 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
|||
import eu.dnetlib.dhp.schema.oaf.utils.PidType
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed._
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
@ -13,14 +16,16 @@ import org.junit.jupiter.api.Assertions._
|
|||
import org.junit.jupiter.api.extension.ExtendWith
|
||||
import org.junit.jupiter.api.{BeforeEach, Test}
|
||||
import org.mockito.junit.jupiter.MockitoExtension
|
||||
import org.slf4j.LoggerFactory
|
||||
|
||||
import java.io.{BufferedReader, InputStream, InputStreamReader}
|
||||
import java.util.regex.Pattern
|
||||
import java.util.zip.GZIPInputStream
|
||||
import javax.xml.stream.XMLInputFactory
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable
|
||||
import scala.collection.mutable.ListBuffer
|
||||
import scala.io.Source
|
||||
import scala.xml.pull.XMLEventReader
|
||||
|
||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||
class BioScholixTest extends AbstractVocabularyTest {
|
||||
|
@ -48,6 +53,76 @@ class BioScholixTest extends AbstractVocabularyTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
def testPid(): Unit = {
  // Sample ORCID values in the three shapes present below: bare 16-character
  // strings, already hyphenated values, and http://orcid.org/ URLs.
  val rawOrcids = List(
    "0000000163025705",
    "000000018494732X",
    "0000000308873343",
    "0000000335964515",
    "0000000333457333",
    "0000000335964515",
    "0000000302921949",
    "http://orcid.org/0000-0001-8567-3543",
    "http://orcid.org/0000-0001-7868-8528",
    "0000-0001-9189-1440",
    "0000-0003-3727-9247",
    "0000-0001-7246-1058",
    "000000033962389X",
    "0000000330371470",
    "0000000171236123",
    "0000000272569752",
    "0000000293231371",
    "http://orcid.org/0000-0003-3345-7333",
    "0000000340145688",
    "http://orcid.org/0000-0003-4894-1689"
  )

  // Four hyphen-separated groups of four characters; the last character may be the check digit X.
  val orcidFormat = "[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]"

  for (raw <- rawOrcids) {
    val normalized = new PMIdentifier(raw, "ORCID").getPid
    println(normalized)
    // Every input shape must come out of PMIdentifier in the hyphenated form.
    assertTrue(normalized.matches(orcidFormat))
  }
}
|
||||
|
||||
/** Extracts the text content of every `<Affiliation>` element found in `s`.
  *
  * The capture group is reluctant (`.*?`), so several elements on the same line
  * yield one entry each instead of a single match running from the first opening
  * tag to the last closing tag. `.` does not match line terminators, so an
  * element whose text spans more than one line is not matched.
  *
  * @param s raw XML text to scan
  * @return the affiliation strings in order of appearance; empty when nothing matches
  */
def extractAffiliation(s: String): List[String] = {
  val affiliation = "<Affiliation>(.*?)</Affiliation>".r
  affiliation.findAllMatchIn(s).map(_.group(1)).toList
}
|
||||
|
||||
/** An author identifier as it appears in the XML: the value of the `Source`
  * attribute (`pidType`) and the element text (`pid`).
  */
case class AuthorPID(pidType: String, pid: String) {}

/** Extracts every `<Identifier Source="...">...</Identifier>` element found in `s`.
  *
  * Both capture groups are reluctant (`.*?`), so two identifiers on the same line
  * produce two results rather than one match whose `Source` group swallows
  * everything up to the last `">`. `.` does not match line terminators, so an
  * element spanning more than one line is not matched.
  *
  * @param s raw XML text to scan
  * @return one [[AuthorPID]] per matched element, in order of appearance; empty when nothing matches
  */
def extractAuthorIdentifier(s: String): List[AuthorPID] = {
  val identifier = "<Identifier Source=\"(.*?)\">(.*?)</Identifier>".r
  identifier
    .findAllMatchIn(s)
    .map(m => AuthorPID(pidType = m.group(1), pid = m.group(2)))
    .toList
}
|
||||
|
||||
@Test
def testParsingPubmed2(): Unit = {
  // Parse the single-article fixture with PMParser2, convert it with PubMedToOaf
  // and print the pretty-printed JSON. This test makes no assertions; it fails
  // only if one of these steps throws.
  val resource = getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml")
  val article = new PMParser2().parse(IOUtils.toString(resource))
  val converted = PubMedToOaf.convert(article, vocabularies)

  val prettyWriter = new ObjectMapper().writerWithDefaultPrettyPrinter()
  println(prettyWriter.writeValueAsString(converted))
}
|
||||
|
||||
@Test
|
||||
def testEBIData() = {
|
||||
val inputFactory = XMLInputFactory.newInstance
|
||||
|
@ -124,6 +199,14 @@ class BioScholixTest extends AbstractVocabularyTest {
|
|||
}
|
||||
}
|
||||
|
||||
// Runs SparkCreatePubmedDump.createPubmedDump on a local Spark session.
// NOTE(review): no @Test annotation is visible on this method, and both paths are
// absolute locations under /home/sandro/Downloads, so it can only run on a machine
// that has that directory.
def testPubmedSplitting(): Unit = {
  val inputPath = "/home/sandro/Downloads/pubmed"
  val outputPath = "/home/sandro/Downloads/pubmed_mapped"

  val session: SparkSession = SparkSession.builder().appName("test").master("local").getOrCreate()
  new SparkCreatePubmedDump("", Array.empty, LoggerFactory.getLogger(getClass))
    .createPubmedDump(session, inputPath, outputPath, vocabularies)
}
|
||||
|
||||
@Test
|
||||
def testPubmedOriginalID(): Unit = {
|
||||
val article: PMArticle = new PMArticle
|
||||
|
|
|
@ -130,6 +130,7 @@ public class ResultTagger implements Serializable {
|
|||
// log.info("Remove constraints for " + communityId);
|
||||
if (conf.getRemoveConstraintsMap().keySet().contains(communityId) &&
|
||||
conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null &&
|
||||
!conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() &&
|
||||
conf
|
||||
.getRemoveConstraintsMap()
|
||||
.get(communityId)
|
||||
|
@ -161,29 +162,30 @@ public class ResultTagger implements Serializable {
|
|||
|
||||
// Tagging for datasource
|
||||
final Set<String> datasources = new HashSet<>();
|
||||
final Set<String> collfrom = new HashSet<>();
|
||||
final Set<String> cfhb = new HashSet<>();
|
||||
final Set<String> hostdby = new HashSet<>();
|
||||
|
||||
if (Objects.nonNull(result.getInstance())) {
|
||||
for (Instance i : result.getInstance()) {
|
||||
if (Objects.nonNull(i.getCollectedfrom()) && Objects.nonNull(i.getCollectedfrom().getKey())) {
|
||||
collfrom.add(i.getCollectedfrom().getKey());
|
||||
cfhb.add(i.getCollectedfrom().getKey());
|
||||
}
|
||||
if (Objects.nonNull(i.getHostedby()) && Objects.nonNull(i.getHostedby().getKey())) {
|
||||
cfhb.add(i.getHostedby().getKey());
|
||||
hostdby.add(i.getHostedby().getKey());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
collfrom
|
||||
cfhb
|
||||
.forEach(
|
||||
dsId -> datasources
|
||||
.addAll(
|
||||
conf.getCommunityForDatasource(dsId, param)));
|
||||
hostdby.forEach(dsId -> {
|
||||
datasources
|
||||
.addAll(
|
||||
conf.getCommunityForDatasource(dsId, param));
|
||||
// datasources
|
||||
// .addAll(
|
||||
// conf.getCommunityForDatasource(dsId, param));
|
||||
if (conf.isEoscDatasource(dsId)) {
|
||||
datasources.add("eosc");
|
||||
}
|
||||
|
@ -226,6 +228,7 @@ public class ResultTagger implements Serializable {
|
|||
.forEach(communityId -> {
|
||||
if (!removeCommunities.contains(communityId) &&
|
||||
conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null &&
|
||||
!conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() &&
|
||||
conf
|
||||
.getSelectionConstraintsMap()
|
||||
.get(communityId)
|
||||
|
|
|
@ -33,6 +33,8 @@ public class SelectionConstraints implements Serializable {
|
|||
|
||||
// Constraints in or
|
||||
public boolean verifyCriteria(final Map<String, List<String>> param) {
|
||||
if (criteria.isEmpty())
|
||||
return true;
|
||||
for (Constraints selc : criteria) {
|
||||
if (selc.verifyCriteria(param)) {
|
||||
return true;
|
||||
|
|
|
@ -319,7 +319,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
|
||||
@Override
|
||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//*[local-name()='description' and ./@descriptionType='Abstract']", info);
|
||||
return prepareListFields(doc, "//datacite:description[./@descriptionType='Abstract'] | //dc:description", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -906,6 +906,30 @@ class MappersTest {
|
|||
assertEquals("IT", p.getCountry().get(0).getClassid());
|
||||
assertEquals("FR", p.getCountry().get(1).getClassid());
|
||||
assertEquals("DE", p.getCountry().get(2).getClassid());
|
||||
|
||||
assertNotNull(p.getDescription());
|
||||
assertEquals(1, p.getDescription().size());
|
||||
assertNotNull(p.getDescription().get(0));
|
||||
assertTrue(StringUtils.isNotBlank(p.getDescription().get(0).getValue()));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testODFRecord_guidelines4() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_guidelines4.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
final Publication p = (Publication) list.get(0);
|
||||
assertValidId(p.getId());
|
||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||
|
||||
assertNotNull(p.getDescription());
|
||||
assertEquals(2, p.getDescription().size());
|
||||
assertNotNull(p.getDescription().get(0));
|
||||
assertTrue(StringUtils.isNotBlank(p.getDescription().get(0).getValue()));
|
||||
assertNotNull(p.getDescription().get(1));
|
||||
assertTrue(StringUtils.isNotBlank(p.getDescription().get(1).getValue()));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -0,0 +1,95 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:datacite="http://datacite.org/schema/kernel-3"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/">
|
||||
<header xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:dri="http://www.driver-repository.eu/namespace/dri">
|
||||
<dri:objIdentifier>od______1261::66de86a37fa980b9b8f4cefdf1c5a0c5</dri:objIdentifier>
|
||||
<dri:recordIdentifier>oai:air.unimi.it:2434/636308</dri:recordIdentifier>
|
||||
<dri:dateOfCollection>2020-09-04T14:36:48.411Z</dri:dateOfCollection>
|
||||
<oaf:datasourceprefix>od______1261</oaf:datasourceprefix>
|
||||
<identifier>oai:air.unimi.it:2434/636308</identifier>
|
||||
<datestamp>2022-10-14T14:17:28Z</datestamp>
|
||||
<setSpec>com_2434_73555</setSpec>
|
||||
<setSpec>col_2434_73557</setSpec>
|
||||
<setSpec>openaire</setSpec>
|
||||
<dr:dateOfTransformation>2024-11-07T06:17:52.454Z</dr:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaire:resource xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
|
||||
xmlns:exslt="http://exslt.org/common"
|
||||
xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
||||
xmlns:rdf="http://www.w3.org/TR/rdf-concepts/"
|
||||
xmlns:doc="http://www.lyncode.com/xoai"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:datacite="http://datacite.org/schema/kernel-4"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:vc="http://www.w3.org/2007/XMLSchema-versioning"
|
||||
xmlns="http://www.openarchives.org/OAI/2.0/"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/schema/oaire/ https://www.openaire.eu/schema/repo-lit/4.0/openaire.xsd">
|
||||
<datacite:titles>
|
||||
<datacite:title xml:lang="en">I processi incorpati di costruzione della conoscenza nelle pratiche di cura</datacite:title>
|
||||
</datacite:titles>
|
||||
<datacite:creators>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>S. Visioli</datacite:creatorName>
|
||||
<nameIdentifier nameIdentifierScheme="IRIS">rp17381</nameIdentifier>
|
||||
<affiliation affiliationIdentifierScheme="ROR"
|
||||
affiliationIdentifier="https://ror.org/00wjc7c48">University of Milan</affiliation>
|
||||
<nameIdentifier nameIdentifierScheme="UGOV">53567</nameIdentifier>
|
||||
</datacite:creator>
|
||||
<datacite:creator>
|
||||
<datacite:creatorName>L. Zannini</datacite:creatorName>
|
||||
<datacite:nameIdentifier schemeURI="https://orcid.org" nameIdentifierScheme="ORCID">0000-0001-7287-6616</datacite:nameIdentifier>
|
||||
<nameIdentifier nameIdentifierScheme="IRIS">rp31993</nameIdentifier>
|
||||
<affiliation affiliationIdentifierScheme="ROR"
|
||||
affiliationIdentifier="https://ror.org/00wjc7c48">University of Milan</affiliation>
|
||||
<nameIdentifier nameIdentifierScheme="UGOV">5556</nameIdentifier>
|
||||
</datacite:creator>
|
||||
</datacite:creators>
|
||||
<oaire:citationStartPage>487</oaire:citationStartPage>
|
||||
<oaire:citationIssue>2</oaire:citationIssue>
|
||||
<oaire:citationEndPage>506</oaire:citationEndPage>
|
||||
<oaire:citationVolume>8</oaire:citationVolume>
|
||||
<oaire:citationTitle>METIS</oaire:citationTitle>
|
||||
<datacite:alternateIdentifiers>
|
||||
<datacite:alternateIdentifier alternateIdentifierType="DOI">10.30557/MT00041</datacite:alternateIdentifier>
|
||||
<datacite:alternateIdentifier alternateIdentifierType="URN">http://hdl.handle.net/2434/636308</datacite:alternateIdentifier>
|
||||
</datacite:alternateIdentifiers>
|
||||
<datacite:dates>
|
||||
<datacite:date dateType="Accepted">2018</datacite:date>
|
||||
<datacite:date dateType="Issued">2018</datacite:date>
|
||||
<datacite:date dateType="Available">2019-05-23</datacite:date>
|
||||
</datacite:dates>
|
||||
<dc:language>ita</dc:language>
|
||||
<oaire:resourceType resourceTypeGeneral="literature"
|
||||
uri="http://purl.org/coar/resource_type/c_6501">journal article</oaire:resourceType>
|
||||
<dc:description>In ambito educativo, il corpo è stato tradizionalmente marginalizzato o rifiutato come fonte di conoscenza. Negli ultimi decenni, la letteratura pedagogica ha manifestato crescente interesse per il ruolo che ha il corpo dell’insegnante, nel pianificare e realizzare l’insegnamento, sfidando le ideologie e le epistemologie dominanti, secondo le quali la nostra mente è la fonte primaria di apprendimento. Al contrario, una pedagogia della conoscenza incorpata (embodiment) considera il corpo centrale nel nostro conoscere (le pratiche d’insegnamento) e nel nostro essere (insegnanti). Analogamente, la letteratura delle professioni del caring si è rivelata molto attenta ai temi della conoscenza incorpata, ossia di quel sapere che è presente in un corpo, spesso in modo tacito. Le infermiere hanno da tempo esplorato il ruolo del loro corpo nei processi di costruzione della conoscenza, come anche il ruolo del corpo dello studente, e addirittura del paziente, nel conoscere la malattia. In questo paper riportiamo alcune riflessioni ed esperienze sulla conoscenza incorpata nelle pratiche assistenziali, mettendo in luce come questa, benché spesso tacita, abbia un ruolo centrale nell’aver cura delle persone malate.</dc:description>
|
||||
<dc:description>In educational contexts, the body has been traditionally marginalized or rejected as a source of knowledge. In the last decades, some pedagogical contributions have shown interest for the role that the teacher’s body has in planning and implementing teaching, challenging dominant ideologies and epistemologies that tell us our minds are the primary sources of learning. Conversely, a pedagogy of embodiment makes the body central in our knowing (educational practices) and being (a teacher). Similarly, literature from the caring professions has revealed very receptive to-ward issues related to embodied knowledge, that is to say the knowledge that the body owns, which is frequently tacit. Since many years, nurses have explored the role of body in the processes of knowledge building, as well as the role of the student’s body, and even the patient’s body, in knowing the illness. In this paper we report some reflections and experiences concerning embodied knowledge in the nursing practice, highlighting that embodied knowledge, even if tacit, has a pivotal role when caring for sick people.</dc:description>
|
||||
<dc:format>application/pdf</dc:format>
|
||||
<datacite:identifier xmlns:datacite="http://datacite.org/schema/kernel-3"
|
||||
identifierType="Handle">2434/636308</datacite:identifier>
|
||||
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
|
||||
<datacite:subjects>
|
||||
<datacite:subject>body; educational practices; embodied knowledge; nursing; teaching</datacite:subject>
|
||||
</datacite:subjects>
|
||||
<datacite:sizes/>
|
||||
<datacite:sizes>
|
||||
<datacite:size>413926 bytes</datacite:size>
|
||||
</datacite:sizes>
|
||||
<datacite:sizes/>
|
||||
<oaire:file accessRightsURI="http://purl.org/coar/access_right/c_14cb"
|
||||
mimeType="application/pdf"
|
||||
objectType="fulltext">https://air.unimi.it/bitstream/2434/636308/2/Finale%20pubblicato.pdf</oaire:file>
|
||||
</oaire:resource>
|
||||
<oaf:fulltext>https://air.unimi.it/bitstream/2434/636308/2/Finale%20pubblicato.pdf</oaf:fulltext>
|
||||
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2018-01-01</oaf:dateAccepted>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:language>ita</oaf:language>
|
||||
<oaf:hostedBy name="Archivio Istituzionale della Ricerca dell'Università degli Studi di Milano"
|
||||
id="opendoar____::1261"/>
|
||||
<oaf:collectedFrom name="Archivio Istituzionale della Ricerca dell'Università degli Studi di Milano"
|
||||
id="opendoar____::1261"/>
|
||||
</metadata>
|
||||
</record>
|
|
@ -174,6 +174,8 @@ public class ProvisionModelSupport {
|
|||
&& StringUtils.isNotBlank(relation.getValidationDate())) {
|
||||
rr.setValidationDate(relation.getValidationDate());
|
||||
}
|
||||
rr.setGivenName(re.getGivenName());
|
||||
rr.setFamilyName(re.getFamilyName());
|
||||
|
||||
return rr;
|
||||
}
|
||||
|
@ -208,11 +210,31 @@ public class ProvisionModelSupport {
|
|||
ps.setAlternativeNames(p.getAlternativeNames());
|
||||
ps.setBiography(p.getBiography());
|
||||
ps.setConsent(p.getConsent());
|
||||
// ps.setSubject(...));
|
||||
ps.setSubject(mapPersonTopics(p.getSubject()));
|
||||
|
||||
return ps;
|
||||
}
|
||||
|
||||
private static List<PersonTopic> mapPersonTopics(List<eu.dnetlib.dhp.schema.oaf.PersonTopic> subjects) {
|
||||
return Optional
|
||||
.ofNullable(subjects)
|
||||
.map(
|
||||
ss -> ss
|
||||
.stream()
|
||||
.map(ProvisionModelSupport::mapPersonTopic)
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
private static PersonTopic mapPersonTopic(eu.dnetlib.dhp.schema.oaf.PersonTopic pt) {
|
||||
PersonTopic topic = new PersonTopic();
|
||||
topic.setValue(pt.getValue());
|
||||
topic.setSchema(pt.getSchema());
|
||||
topic.setFromYear(pt.getFromYear());
|
||||
topic.setToYear(pt.getToYear());
|
||||
return topic;
|
||||
}
|
||||
|
||||
private static Funding mapFunding(List<String> fundingtree, VocabularyGroup vocs) {
|
||||
SAXReader reader = new SAXReader();
|
||||
return Optional
|
||||
|
|
2
pom.xml
2
pom.xml
|
@ -937,7 +937,7 @@
|
|||
<commons.logging.version>1.1.3</commons.logging.version>
|
||||
<commons-validator.version>1.7</commons-validator.version>
|
||||
<dateparser.version>1.0.7</dateparser.version>
|
||||
<dhp-schemas.version>[9.0.0]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[10.0.0]</dhp-schemas.version>
|
||||
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
|
||||
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
|
||||
<dhp.guava.version>11.0.2</dhp.guava.version>
|
||||
|
|
Loading…
Reference in New Issue