This commit is contained in:
Miriam Baglioni 2021-11-25 10:57:06 +01:00
parent 9a4c2aff07
commit 38065d6ed6
3 changed files with 45 additions and 42 deletions

View File

@ -210,11 +210,16 @@ public class DoiBoostAuthorMerger {
enrich.setPid(new ArrayList<>()); enrich.setPid(new ArrayList<>());
} }
Set<String> aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet()); Set<String> aPids = enrich.getPid().stream().map(p -> pidToComparableString(p)).collect(Collectors.toSet());
ArrayList<StructuredProperty> newPids = new ArrayList<>();
newPids.addAll(enrich.getPid());
enriching.getPid().forEach(p -> { enriching.getPid().forEach(p -> {
if (!aPids.contains(pidToComparableString(p))) { String pidToComparableString = pidToComparableString(p);
enrich.getPid().add(p); if (!aPids.contains(pidToComparableString)) {
newPids.add(p);
aPids.add(pidToComparableString);
} }
}); });
enrich.setPid(newPids);
if (enrich.getAffiliation() == null) { if (enrich.getAffiliation() == null) {
if (enriching.getAffiliation() != null) { if (enriching.getAffiliation() != null) {
enrich.setAffiliation(enriching.getAffiliation()); enrich.setAffiliation(enriching.getAffiliation());

View File

@ -86,7 +86,7 @@
<case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case> <case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
<case to="ProcessORCID">${wf:conf('resumeFrom') eq 'ProcessORCID'}</case> <case to="ProcessORCID">${wf:conf('resumeFrom') eq 'ProcessORCID'}</case>
<case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case> <case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
<case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case> <!-- <case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>-->
<default to="ConvertCrossrefToOAF"/> <default to="ConvertCrossrefToOAF"/>
</switch> </switch>
</decision> </decision>
@ -226,40 +226,40 @@
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg> <arg>--master</arg><arg>yarn-cluster</arg>
</spark> </spark>
<ok to="GenerateActionSet"/>
<error to="Kill"/>
</action>
<action name="GenerateActionSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Generate DOIBoost ActionSet</name>
<class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
<arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
<arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
<arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
<arg>--dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>
<arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
<arg>--sFilePath</arg><arg>${outputPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- <action name="GenerateActionSet">-->
<!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
<!-- <master>yarn-cluster</master>-->
<!-- <mode>cluster</mode>-->
<!-- <name>Generate DOIBoost ActionSet</name>-->
<!-- <class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>-->
<!-- <jar>dhp-doiboost-${projectVersion}.jar</jar>-->
<!-- <spark-opts>-->
<!-- &#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!-- &#45;&#45;executor-cores=${sparkExecutorCores}-->
<!-- &#45;&#45;driver-memory=${sparkDriverMemory}-->
<!-- &#45;&#45;conf spark.sql.shuffle.partitions=3840-->
<!-- &#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!-- &#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!-- &#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!-- &#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!-- </spark-opts>-->
<!-- <arg>&#45;&#45;dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>-->
<!-- <arg>&#45;&#45;dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>-->
<!-- <arg>&#45;&#45;crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>-->
<!-- <arg>&#45;&#45;dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>-->
<!-- <arg>&#45;&#45;dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>-->
<!-- <arg>&#45;&#45;targetPath</arg><arg>${workingPath}/actionDataSet</arg>-->
<!-- <arg>&#45;&#45;sFilePath</arg><arg>${outputPath}</arg>-->
<!-- <arg>&#45;&#45;master</arg><arg>yarn-cluster</arg>-->
<!-- </spark>-->
<!-- <ok to="End"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -225,28 +225,26 @@ public class GraphCleaningFunctionsTest {
GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")); GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
} }
@Test @Test
public void testCleanDoiBoost() throws IOException { public void testCleanDoiBoost() throws IOException {
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json")); String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json"));
Publication p_in = MAPPER.readValue(json, Publication.class); Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping); Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
Publication cleaned = GraphCleaningFunctions.cleanup(p_out); Publication cleaned = GraphCleaningFunctions.cleanup(p_out);
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
Assertions.assertEquals(true,GraphCleaningFunctions.filter(cleaned) );
} }
@Test @Test
public void testCleanDoiBoost2() throws IOException { public void testCleanDoiBoost2() throws IOException {
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json")); String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json"));
Publication p_in = MAPPER.readValue(json, Publication.class); Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping); Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
Publication cleaned = GraphCleaningFunctions.cleanup(p_out); Publication cleaned = GraphCleaningFunctions.cleanup(p_out);
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
Assertions.assertEquals(true,GraphCleaningFunctions.filter(cleaned) );
} }
} }