This commit is contained in:
Claudio Atzori 2023-01-31 14:39:53 +01:00
commit d05ca53a14
8 changed files with 119 additions and 73 deletions

View File

@ -44,7 +44,7 @@ public class PropagationConstant {
public final static String NULL = "NULL"; public final static String NULL = "NULL";
public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional"; public static final String INSTITUTIONAL_REPO_TYPE = "institutional";
public static final String PROPAGATION_DATA_INFO_TYPE = "propagation"; public static final String PROPAGATION_DATA_INFO_TYPE = "propagation";
@ -231,9 +231,9 @@ public class PropagationConstant {
if (HdfsSupport.exists(inputPath, spark.sparkContext().hadoopConfiguration())) { if (HdfsSupport.exists(inputPath, spark.sparkContext().hadoopConfiguration())) {
return spark return spark
.read() .read()
.textFile(inputPath) .textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
} else { } else {
return spark.emptyDataset(Encoders.bean(clazz)); return spark.emptyDataset(Encoders.bean(clazz));
} }

View File

@ -1,8 +1,10 @@
package eu.dnetlib.dhp.bulktag.community; package eu.dnetlib.dhp.bulktag.community;
import java.io.IOException;
import java.util.List; import java.util.List;
import org.apache.commons.io.IOUtils;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
@ -13,74 +15,17 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class QueryInformationSystem { public class QueryInformationSystem {
private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
+ " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() "
+ " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept "
+ " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept "
+ " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept "
+ " let $fos := $x//CONFIGURATION/context/param[./@name='fos']/text() "
+ " let $sdg := $x//CONFIGURATION/context/param[./@name='sdg']/text() "
+
"let $zenodo := $x//param[./@name='zenodoCommunity']/text() "
+ " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//context/param[./@name = 'status']/text() != 'hidden' "
+ " return "
+ " <community> "
+ " { $x//CONFIGURATION/context/@id} "
+ " <advancedConstraints>" +
"{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }" +
"</advancedConstraints>"
+ " <subjects> "
+ " {for $y in tokenize($subj,',') "
+ " return "
+ " <subject>{$y}</subject>} "
+ " {for $y in tokenize($fos,',') "
+ " return "
+ " <subject>{$y}</subject>} "
+ " {for $y in tokenize($sdg,',') "
+ " return "
+ " <subject>{$y}</subject>} "
+ " </subjects> "
+ " <datasources> "
+ " {for $d in $datasources "
+ " where $d/param[./@name='enabled']/text()='true' "
+ " return "
+ " <datasource> "
+ " <openaireId> "
+ " {$d//param[./@name='openaireId']/text()} "
+ " </openaireId> "
+ " <selcriteria> "
+ " {$d/param[./@name='selcriteria']/text()} "
+ " </selcriteria> "
+ " </datasource> } "
+ " </datasources> " +
" <zenodocommunities> " +
"{for $zc in $zenodo " +
"return " +
"<zenodocommunity> " +
"<zenodoid> " +
"{$zc} " +
"</zenodoid> " +
"</zenodocommunity>}"
+ " {for $zc in $communities "
+ " return "
+ " <zenodocommunity> "
+ " <zenodoid> "
+ " {$zc/param[./@name='zenodoid']/text()} "
+ " </zenodoid> "
+ " <selcriteria> "
+ " {$zc/param[./@name='selcriteria']/text()} "
+ " </selcriteria> "
+ " </zenodocommunity>} "
+ " </zenodocommunities> "
+ "<advancedConstraint>"
+ "{$x//CONFIGURATION/context/param[./@name='advancedConstraint']/text()} "
+ "</advancedConstraint>"
+ " </community>";
public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl) public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl)
throws ISLookUpException, DocumentException, SAXException { throws ISLookUpException, DocumentException, SAXException, IOException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
final List<String> res = isLookUp.quickSearchProfile(XQUERY); final List<String> res = isLookUp
.quickSearchProfile(
IOUtils
.toString(
QueryInformationSystem.class
.getResourceAsStream(
"/eu/dnetlib/dhp/bulktag/query.xq")));
final String xmlConf = "<communities>" + Joiner.on(" ").join(res) + "</communities>"; final String xmlConf = "<communities>" + Joiner.on(" ").join(res) + "</communities>";

View File

@ -102,7 +102,7 @@ public class PrepareResultInstRepoAssociation {
String query = "SELECT source datasourceId, target organizationId " String query = "SELECT source datasourceId, target organizationId "
+ "FROM ( SELECT id " + "FROM ( SELECT id "
+ "FROM datasource " + "FROM datasource "
+ "WHERE datasourcetype.classid = '" + "WHERE lower(jurisdiction.classid) = '"
+ INSTITUTIONAL_REPO_TYPE + INSTITUTIONAL_REPO_TYPE
+ "' " + "' "
+ "AND datainfo.deletedbyinference = false " + blacklisted + " ) d " + "AND datainfo.deletedbyinference = false " + blacklisted + " ) d "

View File

@ -38,13 +38,13 @@
{ {
"paramName": "test", "paramName": "test",
"paramLongName": "isTest", "paramLongName": "isTest",
"paramDescription": "true if the spark session is managed, false otherwise", "paramDescription": "Parameter intended for testing purposes only. True if the reun is relatesd to a test and so the taggingConf parameter should be loaded",
"paramRequired": false "paramRequired": false
}, },
{ {
"paramName": "tg", "paramName": "tg",
"paramLongName": "taggingConf", "paramLongName": "taggingConf",
"paramDescription": "true if the spark session is managed, false otherwise", "paramDescription": "this parameter is intended for testing purposes only. It is a possible tagging configuration obtained via the XQUERY. Intended to be removed",
"paramRequired": false "paramRequired": false
} }

View File

@ -219,7 +219,7 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="wait" to="eosc_tag"/> <join name="wait" to="End"/>
<action name="eosc_tag"> <action name="eosc_tag">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">

View File

@ -0,0 +1,59 @@
for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')
let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text()
let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept
let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept
let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept
let $fos := $x//CONFIGURATION/context/param[./@name='fos']/text()
let $sdg := $x//CONFIGURATION/context/param[./@name='sdg']/text()
let $zenodo := $x//param[./@name='zenodoCommunity']/text()
where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//context/param[./@name = 'status']/text() != 'hidden'
return
<community>
{ $x//CONFIGURATION/context/@id}
<advancedConstraints>
{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }
</advancedConstraints>
<subjects>
{for $y in tokenize($subj,',')
return
<subject>{$y}</subject>}
{for $y in tokenize($fos,',')
return
<subject>{$y}</subject>}
{for $y in tokenize($sdg,',')
return
<subject>{$y}</subject>}
</subjects>
<datasources>
{for $d in $datasources
where $d/param[./@name='enabled']/text()='true'
return
<datasource>
<openaireId>
{$d//param[./@name='openaireId']/text()}
</openaireId>
<selcriteria>
{$d/param[./@name='selcriteria']/text()}
</selcriteria>
</datasource> }
</datasources>
<zenodocommunities>
{for $zc in $zenodo
return
<zenodocommunity>
<zenodoid>
{$zc}
</zenodoid>
</zenodocommunity>}
{for $zc in $communities
return
<zenodocommunity>
<zenodoid>
{$zc/param[./@name='zenodoid']/text()}
</zenodoid>
<selcriteria>
{$zc/param[./@name='selcriteria']/text()}
</selcriteria>
</zenodocommunity>}
</zenodocommunities>
</community>

View File

@ -5,6 +5,7 @@ import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
@ -147,4 +148,44 @@ public class CleanCountryTest {
.size()); .size());
} }
@Test
public void testDatasetClean() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json")
.getPath();
spark
.read()
.textFile(sourcePath)
.map(
(MapFunction<String, Dataset>) r -> OBJECT_MAPPER.readValue(r, Dataset.class),
Encoders.bean(Dataset.class))
.write()
.json(workingDir.toString() + "/dataset");
CleanCountrySparkJob.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--inputPath", workingDir.toString() + "/dataset",
"-graphTableClassName", Dataset.class.getCanonicalName(),
"-workingDir", workingDir.toString() + "/working",
"-country", "NL",
"-verifyParam", "10.17632",
"-collectedfrom", "NARCIS",
"-hostedBy", getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
.getPath()
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(1, tmp.count());
Assertions.assertEquals(0, tmp.first().getCountry().size());
}
} }

File diff suppressed because one or more lines are too long