refactoring

This commit is contained in:
Miriam Baglioni 2020-04-24 10:47:43 +02:00
parent 0e447add66
commit adcbf0e29a
8 changed files with 1474 additions and 1283 deletions

View File

@ -0,0 +1,7 @@
# sandboxName is generated automatically when it is not provided explicitly
sandboxName=${sandboxName}
sandboxDir=/user/${dhp.hadoop.frontend.user.name}/${sandboxName}
workingDir=${sandboxDir}/working_dir
oozie.wf.application.path = ${nameNode}${sandboxDir}/${oozieAppDir}
oozieTopWfApplicationPath = ${oozie.wf.application.path}

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dhp; package eu.dnetlib.dhp.bulktag;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@ -28,7 +28,7 @@ public class SparkBulkTagJob2 {
String jsonConfiguration = String jsonConfiguration =
IOUtils.toString( IOUtils.toString(
SparkBulkTagJob2.class.getResourceAsStream( SparkBulkTagJob2.class.getResourceAsStream(
"/eu/dnetlib/dhp/bulktag/input_bulktag_parameters.json")); "/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

View File

@ -65,13 +65,16 @@ public class ResultTagger implements Serializable {
// tagging for Subject // tagging for Subject
final Set<String> subjects = new HashSet<>(); final Set<String> subjects = new HashSet<>();
result.getSubject().stream() Optional<List<StructuredProperty>> oresultsubj = Optional.ofNullable(result.getSubject());
if (oresultsubj.isPresent()) {
oresultsubj.get().stream()
.map(subject -> subject.getValue()) .map(subject -> subject.getValue())
.filter(StringUtils::isNotBlank) .filter(StringUtils::isNotBlank)
.map(String::toLowerCase) .map(String::toLowerCase)
.map(String::trim) .map(String::trim)
.collect(Collectors.toCollection(HashSet::new)) .collect(Collectors.toCollection(HashSet::new))
.forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s))); .forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s)));
}
communities.addAll(subjects); communities.addAll(subjects);
@ -79,23 +82,32 @@ public class ResultTagger implements Serializable {
final Set<String> datasources = new HashSet<>(); final Set<String> datasources = new HashSet<>();
final Set<String> tmp = new HashSet<>(); final Set<String> tmp = new HashSet<>();
for (Instance i : result.getInstance()) { Optional<List<Instance>> oresultinstance = Optional.ofNullable(result.getInstance());
if (oresultinstance.isPresent()) {
for (Instance i : oresultinstance.get()) {
tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|")); tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|"));
tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|")); tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|"));
} }
result.getInstance().stream() oresultinstance.get().stream()
.map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey())) .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
.flatMap(p -> Stream.of(p.getFst(), p.getSnd())) .flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
.map(s -> StringUtils.substringAfter(s, "|")) .map(s -> StringUtils.substringAfter(s, "|"))
.collect(Collectors.toCollection(HashSet::new)) .collect(Collectors.toCollection(HashSet::new))
.forEach(dsId -> datasources.addAll(conf.getCommunityForDatasource(dsId, param))); .forEach(
dsId ->
datasources.addAll(
conf.getCommunityForDatasource(dsId, param)));
}
communities.addAll(datasources); communities.addAll(datasources);
/*Tagging for Zenodo Communities*/ /*Tagging for Zenodo Communities*/
final Set<String> czenodo = new HashSet<>(); final Set<String> czenodo = new HashSet<>();
result.getContext().stream()
Optional<List<Context>> oresultcontext = Optional.ofNullable(result.getContext());
if (oresultcontext.isPresent()) {
oresultcontext.get().stream()
.filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR)) .filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR))
.collect(Collectors.toList()) .collect(Collectors.toList())
.forEach( .forEach(
@ -103,8 +115,10 @@ public class ResultTagger implements Serializable {
czenodo.addAll( czenodo.addAll(
conf.getCommunityForZenodoCommunityValue( conf.getCommunityForZenodoCommunityValue(
c.getId() c.getId()
.substring(c.getId().lastIndexOf("/") + 1) .substring(
c.getId().lastIndexOf("/") + 1)
.trim()))); .trim())));
}
communities.addAll(czenodo); communities.addAll(czenodo);

View File

@ -67,8 +67,8 @@
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>bulkTagging</name> <name>bulkTagging-publication</name>
<class>eu.dnetlib.dhp.SparkBulkTagJob2</class> <class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob2</class>
<jar>dhp-bulktag-${projectVersion}.jar</jar> <jar>dhp-bulktag-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--num-executors=${sparkExecutorNumber} --num-executors=${sparkExecutorNumber}
@ -96,8 +96,8 @@
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>bulkTagging</name> <name>bulkTagging-dataset</name>
<class>eu.dnetlib.dhp.SparkBulkTagJob2</class> <class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob2</class>
<jar>dhp-bulktag-${projectVersion}.jar</jar> <jar>dhp-bulktag-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--num-executors=${sparkExecutorNumber} --num-executors=${sparkExecutorNumber}
@ -125,8 +125,8 @@
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>bulkTagging</name> <name>bulkTagging-orp</name>
<class>eu.dnetlib.dhp.SparkBulkTagJob2</class> <class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob2</class>
<jar>dhp-bulktag-${projectVersion}.jar</jar> <jar>dhp-bulktag-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--num-executors=${sparkExecutorNumber} --num-executors=${sparkExecutorNumber}
@ -154,8 +154,8 @@
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>bulkTagging</name> <name>bulkTagging-software</name>
<class>eu.dnetlib.dhp.SparkBulkTagJob2</class> <class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob2</class>
<jar>dhp-bulktag-${projectVersion}.jar</jar> <jar>dhp-bulktag-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--num-executors=${sparkExecutorNumber} --num-executors=${sparkExecutorNumber}

View File

@ -3,6 +3,7 @@ package eu.dnetlib.dhp;
import static eu.dnetlib.dhp.community.TagginConstants.ZENODO_COMMUNITY_INDICATOR; import static eu.dnetlib.dhp.community.TagginConstants.ZENODO_COMMUNITY_INDICATOR;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.bulktag.SparkBulkTagJob2;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;

View File

@ -141,4 +141,15 @@ public class CommunityConfigurationFactoryTest {
System.out.println(cc.toJson()); System.out.println(cc.toJson());
} }
/**
 * Temporary ("temporaneo") smoke test: loads the {@code tagging_conf.xml} classpath resource,
 * builds a {@link CommunityConfiguration} from it through
 * {@link CommunityConfigurationFactory#newInstance} and prints its JSON form to standard output.
 *
 * <p>The test makes no assertions; it only fails if reading or parsing the resource throws.
 *
 * @throws Exception if the resource cannot be read or the configuration cannot be built
 */
@Test
public void temporaneo() throws Exception {
    final String xml;
    // try-with-resources closes the classpath stream, which was previously left open.
    try (java.io.InputStream in =
            getClass()
                    .getResourceAsStream(
                            "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml")) {
        // Decode explicitly instead of relying on the platform default charset: the
        // configuration contains non-ASCII characters.
        // NOTE(review): assumes the resource is stored as UTF-8 - confirm.
        xml = IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
    }
    final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml);
    System.out.println(cc.toJson());
}
} }

View File

@ -9,8 +9,17 @@
</zenodocommunity> </zenodocommunity>
</zenodocommunities> </zenodocommunities>
<organizations/> <organizations/>
</community> </community> Result: 2 <community id="fet-fp7">
<community id="clarin"> <subjects/>
<datasources/>
<zenodocommunities/>
<organizations/>
</community> Result: 3 <community id="fet-h2020">
<subjects/>
<datasources/>
<zenodocommunities/>
<organizations/>
</community> Result: 4 <community id="clarin">
<subjects/> <subjects/>
<datasources> <datasources>
<datasource> <datasource>
@ -20,39 +29,40 @@
</datasources> </datasources>
<zenodocommunities/> <zenodocommunities/>
<organizations/> <organizations/>
</community> </community> Result: 5 <community id="rda">
<community id="ee"> <subjects/>
<datasources/>
<zenodocommunities>
<zenodocommunity>
<zenodoid>rda</zenodoid>
<selcriteria/>
</zenodocommunity>
</zenodocommunities>
<organizations/>
</community> Result: 6 <community id="ee">
<subjects> <subjects>
<subject>SDG13 - Climate action</subject> <subject>SDG13 - Climate action</subject>
<subject>SDG8 - Decent work and economic <subject>SDG8 - Decent work and economic growth</subject>
growth</subject>
<subject>SDG15 - Life on land</subject> <subject>SDG15 - Life on land</subject>
<subject>SDG2 - Zero hunger</subject> <subject>SDG2 - Zero hunger</subject>
<subject>SDG17 - Partnerships for the <subject>SDG17 - Partnerships for the goals</subject>
goals</subject>
<subject>SDG10 - Reduced inequalities</subject> <subject>SDG10 - Reduced inequalities</subject>
<subject>SDG5 - Gender equality</subject> <subject>SDG5 - Gender equality</subject>
<subject>SDG12 - Responsible <subject>SDG12 - Responsible consumption and production</subject>
consumption and production</subject>
<subject>SDG14 - Life below water</subject> <subject>SDG14 - Life below water</subject>
<subject>SDG6 - Clean water and <subject>SDG6 - Clean water and sanitation</subject>
sanitation</subject>
<subject>SDG11 - Sustainable cities and communities</subject> <subject>SDG11 - Sustainable cities and communities</subject>
<subject>SDG1 - No poverty</subject> <subject>SDG1 - No poverty</subject>
<subject>SDG3 - <subject>SDG3 - Good health and well being</subject>
Good health and well being</subject>
<subject>SDG7 - Affordable and clean energy</subject> <subject>SDG7 - Affordable and clean energy</subject>
<subject>SDG4 - Quality <subject>SDG4 - Quality education</subject>
education</subject>
<subject>SDG9 - Industry innovation and infrastructure</subject> <subject>SDG9 - Industry innovation and infrastructure</subject>
<subject>SDG16 - Peace justice <subject>SDG16 - Peace justice and strong institutions</subject>
and strong institutions</subject>
</subjects> </subjects>
<datasources/> <datasources/>
<zenodocommunities/> <zenodocommunities/>
<organizations/> <organizations/>
</community> </community> Result: 7 <community id="dh-ch">
<community id="dh-ch">
<subjects> <subjects>
<subject>modern art</subject> <subject>modern art</subject>
<subject>monuments</subject> <subject>monuments</subject>
@ -243,8 +253,7 @@
</zenodocommunity> </zenodocommunity>
</zenodocommunities> </zenodocommunities>
<organizations/> <organizations/>
</community> </community> Result: 8 <community id="fam">
<community id="fam">
<subjects> <subjects>
<subject>Stock Assessment</subject> <subject>Stock Assessment</subject>
<subject>pelagic</subject> <subject>pelagic</subject>
@ -363,8 +372,7 @@
</zenodocommunity> </zenodocommunity>
</zenodocommunities> </zenodocommunities>
<organizations/> <organizations/>
</community> </community> Result: 9 <community id="ni">
<community id="ni">
<subjects> <subjects>
<subject>brain mapping</subject> <subject>brain mapping</subject>
<subject>brain imaging</subject> <subject>brain imaging</subject>
@ -478,8 +486,7 @@
</zenodocommunity> </zenodocommunity>
</zenodocommunities> </zenodocommunities>
<organizations/> <organizations/>
</community> </community> Result: 10 <community id="mes">
<community id="mes">
<subjects> <subjects>
<subject>marine</subject> <subject>marine</subject>
<subject>ocean</subject> <subject>ocean</subject>
@ -679,8 +686,7 @@
</zenodocommunity> </zenodocommunity>
</zenodocommunities> </zenodocommunities>
<organizations/> <organizations/>
</community> </community> Result: 11 <community id="instruct">
<community id="instruct">
<subjects/> <subjects/>
<datasources/> <datasources/>
<zenodocommunities> <zenodocommunities>
@ -694,8 +700,12 @@
</zenodocommunity> </zenodocommunity>
</zenodocommunities> </zenodocommunities>
<organizations/> <organizations/>
</community> </community> Result: 12 <community id="elixir-gr">
<community id="aginfra"> <subjects/>
<datasources/>
<zenodocommunities/>
<organizations/>
</community> Result: 13 <community id="aginfra">
<subjects> <subjects>
<subject>animal production and health</subject> <subject>animal production and health</subject>
<subject>fisheries and aquaculture</subject> <subject>fisheries and aquaculture</subject>
@ -817,8 +827,7 @@
</zenodocommunity> </zenodocommunity>
</zenodocommunities> </zenodocommunities>
<organizations/> <organizations/>
</community> </community> Result: 14 <community id="dariah">
<community id="dariah">
<subjects/> <subjects/>
<datasources> <datasources>
<datasource> <datasource>
@ -837,8 +846,17 @@
</zenodocommunity> </zenodocommunity>
</zenodocommunities> </zenodocommunities>
<organizations/> <organizations/>
</community> </community> Result: 15 <community id="risis">
<community id="beopen"> <subjects/>
<datasources/>
<zenodocommunities/>
<organizations/>
</community> Result: 16 <community id="epos">
<subjects/>
<datasources/>
<zenodocommunities/>
<organizations/>
</community> Result: 17 <community id="beopen">
<subjects> <subjects>
<subject>Green Transport</subject> <subject>Green Transport</subject>
<subject>City mobility systems</subject> <subject>City mobility systems</subject>
@ -1136,8 +1154,125 @@
</zenodocommunity> </zenodocommunity>
</zenodocommunities> </zenodocommunities>
<organizations/> <organizations/>
</community> </community> Result: 18 <community id="euromarine">
<community id="covid-19"> <subjects/>
<datasources/>
<zenodocommunities/>
<organizations/>
</community> Result: 19 <community id="ifremer">
<subjects/>
<datasources/>
<zenodocommunities/>
<organizations/>
</community> Result: 20 <community id="oa-pg">
<subjects/>
<datasources/>
<zenodocommunities/>
<organizations/>
</community> Result: 21 <community id="science-innovation-policy">
<subjects>
<subject>Sustainability-oriented science policy</subject>
<subject> STI policies</subject>
<subject>science—society relations</subject>
<subject>Science &amp; Technology Policy</subject>
<subject>Innovation policy</subject>
<subject>science policy</subject>
<subject>Policy and Law</subject>
</subjects>
<datasources>
<datasource>
<openaireId>doajarticles::c6f0ed5fa41e98863e7c73501fe4bd6d</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::ae4c7286c79590f19fdca670156ce816</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::0f664bce92ce953e0c7a92068c46bfb3</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::00017183dc4c858fb77541985323a4ef</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::93b306f458cce3d7aaaf58c0a725f4f9</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::9dbf8fbf3e9fe0fe1fc01e55fbd90bfc</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::a2bda8785c863279bba4b8f34827b4c9</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::019a1fcb42c3fea1c1b689df76330b58</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::0daa8281938831e9c82bfed8b55a2975</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::f67ad6d268162079b3abd51a24468744</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::c6f0ed5fa41e98863e7c73501fe4bd6d</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::ad114356e196a4a3d84dda59c720dacd</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::01e8a54fdecaaf354c67a2dd74ae7d4f</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::449305f096b10a9464449ff2d0e10e06</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::982c0c0ac378256254cce2fa6572bb6c</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::49d6ed47138884566ce93cf0ccb12c02</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::a98e820dbc2e8ee0fc84ab66f263267c</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::50b1ce37427b36368f8f0f1317e47f83</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::f0ec29b7450b2ac5d0ad45327eeb531a</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::d8d421d3b0349a7aaa93758b27a54e84</openaireId>
<selcriteria/>
</datasource>
<datasource>
<openaireId>doajarticles::7ffc35ac5133da01d421ccf8af5b70bc</openaireId>
<selcriteria/>
</datasource>
</datasources>
<zenodocommunities>
<zenodocommunity>
<zenodoid>risis</zenodoid>
<selcriteria/>
</zenodocommunity>
</zenodocommunities>
<organizations/>
</community> Result: 22 <community id="covid-19">
<subjects> <subjects>
<subject>COVID-19</subject> <subject>COVID-19</subject>
<subject>Severe acute respiratory syndrome coronavirus 2</subject> <subject>Severe acute respiratory syndrome coronavirus 2</subject>
@ -1157,7 +1292,6 @@
<openaireId>opendoar____::358aee4cc897452c00244351e4d91f69</openaireId> <openaireId>opendoar____::358aee4cc897452c00244351e4d91f69</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]}, <selcriteria>{"criteria":[{"constraint":[{"verb":"contains","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]}, {"constraint":[{"verb":"contains","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains","field":"title","value":"sars-cov-2"}]},
{"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]} {"constraint":[{"verb":"contains","field":"title","value":"2019-nCoV"}]}]}
</selcriteria> </selcriteria>
</datasource> </datasource>