graph cleaning refactoring #282

Merged
claudio.atzori merged 10 commits from graph_cleaning_refactoring into beta 2023-05-02 10:40:03 +02:00
16 changed files with 1980 additions and 257 deletions
Showing only changes of commit 851f664bd9 - Show all commits

View File

@ -3,6 +3,8 @@ package eu.dnetlib.dhp.common.api;
import java.io.*;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHeaders;
@ -13,6 +15,7 @@ import com.google.gson.Gson;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModelList;
import okhttp3.*;
import org.jetbrains.annotations.NotNull;
public class ZenodoAPIClient implements Serializable {
@ -60,33 +63,31 @@ public class ZenodoAPIClient implements Serializable {
*/
public int newDeposition() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
// Get response body
json = response.body().string();
ZenodoModel newSubmission = new Gson().fromJson(json, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return response.code();
URL url = new URL(urlString);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if(!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel newSubmission = new Gson().fromJson(body, ZenodoModel.class);
this.bucket = newSubmission.getLinks().getBucket();
this.deposition_id = newSubmission.getId();
return responseCode;
}
/**
@ -94,28 +95,48 @@ public class ZenodoAPIClient implements Serializable {
*
* @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo
* @param len the size of the file
* @return the response code
*/
public int uploadIS(InputStream is, String file_name, long len) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder()
.writeTimeout(600, TimeUnit.SECONDS)
.readTimeout(600, TimeUnit.SECONDS)
.connectTimeout(600, TimeUnit.SECONDS)
.build();
public int uploadIS(InputStream is, String file_name) throws IOException {
Request request = new Request.Builder()
.url(bucket + "/" + file_name)
.addHeader(HttpHeaders.CONTENT_TYPE, "application/zip") // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.put(InputStreamRequestBody.create(MEDIA_TYPE_ZIP, is, len))
.build();
URL url = new URL(bucket + "/" + file_name);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, "application/zip");
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
byte[] buf = new byte[8192];
int length;
try (OutputStream os = conn.getOutputStream()) {
while ((length = is.read(buf)) != -1) {
os.write(buf, 0, length);
}
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
}
int responseCode = conn.getResponseCode();
if(! checkOKStatus(responseCode)){
throw new IOException("Unexpected code " + responseCode + getBody(conn));
}
return responseCode;
}
@NotNull
private String getBody(HttpURLConnection conn) throws IOException {
String body = "{}";
try (BufferedReader br = new BufferedReader(
new InputStreamReader(conn.getInputStream(), "utf-8"))) {
StringBuilder response = new StringBuilder();
String responseLine = null;
while ((responseLine = br.readLine()) != null) {
response.append(responseLine.trim());
}
body = response.toString();
}
return body;
}
/**
@ -127,26 +148,36 @@ public class ZenodoAPIClient implements Serializable {
*/
public int sendMretadata(String metadata) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("PUT");
RequestBody body = RequestBody.create(metadata, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.put(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.code();
try (OutputStream os = conn.getOutputStream()) {
byte[] input = metadata.getBytes("utf-8");
os.write(input, 0, input.length);
}
final int responseCode = conn.getResponseCode();
conn.disconnect();
if(!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + getBody(conn));
return responseCode;
}
private boolean checkOKStatus(int responseCode) {
if(HttpURLConnection.HTTP_OK != responseCode ||
HttpURLConnection.HTTP_CREATED != responseCode)
return true ;
return false;
}
/**
@ -155,6 +186,7 @@ public class ZenodoAPIClient implements Serializable {
* @return response code
* @throws IOException
*/
@Deprecated
public int publish() throws IOException {
String json = "{}";
@ -194,28 +226,35 @@ public class ZenodoAPIClient implements Serializable {
setDepositionId(concept_rec_id, 1);
String json = "{}";
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
URL url = new URL(urlString + "/" + deposition_id + "/actions/newversion");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("POST");
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/newversion")
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return response.code();
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if(!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
String latest_draft = zenodoModel.getLinks().getLatest_draft();
deposition_id = latest_draft.substring(latest_draft.lastIndexOf("/") + 1);
bucket = getBucket(latest_draft);
return responseCode;
}
/**
@ -233,24 +272,33 @@ public class ZenodoAPIClient implements Serializable {
this.deposition_id = deposition_id;
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
String json = "{}";
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader("Authorization", "Bearer " + access_token)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return response.code();
URL url = new URL(urlString + "/" + deposition_id);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setRequestMethod("POST");
conn.setDoOutput(true);
try (OutputStream os = conn.getOutputStream()) {
byte[] input = json.getBytes("utf-8");
os.write(input, 0, input.length);
}
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if(!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return responseCode;
}
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
@ -273,53 +321,56 @@ public class ZenodoAPIClient implements Serializable {
private String getPrevDepositions(String page) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
urlBuilder.addQueryParameter("page", page);
String url = urlBuilder.build().toString();
Request request = new Request.Builder()
.url(url)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.get()
.build();
URL url = new URL(urlBuilder.build().toString());
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
return response.body().string();
String body = getBody(conn);
int responseCode = conn.getResponseCode();
conn.disconnect();
if(!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
return body;
}
}
private String getBucket(String url) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder()
.connectTimeout(600, TimeUnit.SECONDS)
.build();
private String getBucket(String inputUurl) throws IOException {
Request request = new Request.Builder()
.url(url)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.get()
.build();
URL url = new URL(inputUurl);
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString());
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + access_token);
conn.setDoOutput(true);
conn.setRequestMethod("GET");
try (Response response = httpClient.newCall(request).execute()) {
String body = getBody(conn);
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
int responseCode = conn.getResponseCode();
// Get response body
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
conn.disconnect();
if(!checkOKStatus(responseCode))
throw new IOException("Unexpected code " + responseCode + body);
ZenodoModel zenodoModel = new Gson().fromJson(body, ZenodoModel.class);
return zenodoModel.getLinks().getBucket();
return zenodoModel.getLinks().getBucket();
}
}

View File

@ -33,7 +33,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
@ -56,7 +56,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz"));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
@ -80,7 +80,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());
@ -100,7 +100,7 @@ class ZenodoAPIClientTest {
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition", file.length()));
Assertions.assertEquals(200, client.uploadIS(is, "newVersion_deposition"));
Assertions.assertEquals(202, client.publish());

View File

@ -16,7 +16,7 @@ public class Community implements Serializable {
private List<String> subjects = new ArrayList<>();
private List<Provider> providers = new ArrayList<>();
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
private SelectionConstraints constraints = new SelectionConstraints();
private SelectionConstraints constraints;
public String toJson() {
final Gson g = new Gson();
@ -26,7 +26,8 @@ public class Community implements Serializable {
public boolean isValid() {
return !getSubjects().isEmpty()
|| !getProviders().isEmpty()
|| !getZenodoCommunities().isEmpty();
|| !getZenodoCommunities().isEmpty()
|| !getConstraints().getCriteria().isEmpty();
}
public String getId() {

View File

@ -37,7 +37,7 @@ public class CommunityConfigurationFactory {
public static CommunityConfiguration newInstance(final String xml) throws DocumentException, SAXException {
log.debug(String.format("parsing community configuration from:\n%s", xml));
log.info(String.format("parsing community configuration from:\n%s", xml));
final SAXReader reader = new SAXReader();
reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
@ -92,12 +92,13 @@ public class CommunityConfigurationFactory {
private static SelectionConstraints parseConstrains(Node node) {
Node advConstsNode = node.selectSingleNode("./advancedConstraints");
if (advConstsNode == null || StringUtils.isBlank(StringUtils.trim(advConstsNode.getText()))) {
return null;
return new SelectionConstraints();
}
SelectionConstraints selectionConstraints = new Gson()
.fromJson(advConstsNode.getText(), SelectionConstraints.class);
selectionConstraints.setSelection(resolver);
log.info("number of selection constraints set " + selectionConstraints.getCriteria().size());
return selectionConstraints;
}

View File

@ -10,11 +10,14 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.bulktag.SparkBulkTagJob;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
@ -22,6 +25,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
/** Created by miriam on 02/08/2018. */
public class ResultTagger implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ResultTagger.class);
private boolean clearContext(Result result) {
int tmp = result.getContext().size();
@ -149,6 +153,8 @@ public class ResultTagger implements Serializable {
});
communities.addAll(aconstraints);
if (aconstraints.size() > 0)
log.info("Found {} for advancedConstraints ", aconstraints.size());
clearContext(result);

View File

@ -0,0 +1,31 @@
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;
import java.util.Locale;
@VerbClass("starts_with_caseinsensitive")
public class StartsWithVerbIgnoreCase implements Selection, Serializable {
private String param;
public StartsWithVerbIgnoreCase() {
}
public StartsWithVerbIgnoreCase(final String param) {
this.param = param;
}
@Override
public boolean apply(String value) {
return value.toLowerCase().startsWith(param.toLowerCase());
}
public String getParam() {
return param;
}
public void setParam(String param) {
this.param = param;
}
}

View File

@ -16,6 +16,7 @@ import javax.print.attribute.DocAttributeSet;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@ -34,6 +35,7 @@ import eu.dnetlib.dhp.bulktag.community.*;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2;
/**
* @author miriam.baglioni
@ -44,6 +46,11 @@ public class SparkEoscBulkTag implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkEoscBulkTag.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static String OPENAIRE_3 = "openaire3.0";
private static String OPENAIRE_4 = "openaire-pub_4.0";
private static String OPENAIRE_CRIS = "openaire-cris_1.1";
private static String OPENAIRE_DATA = "openaire2.0_data";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
@ -72,6 +79,9 @@ public class SparkEoscBulkTag implements Serializable {
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String resultType = parser.get("resultType");
log.info("resultType: {}", resultType);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
@ -82,41 +92,71 @@ public class SparkEoscBulkTag implements Serializable {
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, workingPath);
execBulkTag(spark, inputPath, workingPath, datasourceMapPath, resultClazz);
selectCompliantDatasources(spark, inputPath, workingPath, datasourceMapPath);
execBulkTag(spark, inputPath, workingPath, resultType, resultClazz);
});
}
private static void selectCompliantDatasources(SparkSession spark, String inputPath, String workingPath,
String datasourceMapPath) {
Dataset<Datasource> datasources = readPath(spark, inputPath + "datasource", Datasource.class)
.filter((FilterFunction<Datasource>) ds -> {
final String compatibility = ds.getOpenairecompatibility().getClassid();
return compatibility.equalsIgnoreCase(OPENAIRE_3) ||
compatibility.equalsIgnoreCase(OPENAIRE_4) ||
compatibility.equalsIgnoreCase(OPENAIRE_CRIS) ||
compatibility.equalsIgnoreCase(OPENAIRE_DATA);
});
Dataset<DatasourceMaster> datasourceMaster = readPath(spark, datasourceMapPath, DatasourceMaster.class);
datasources
.joinWith(datasourceMaster, datasources.col("id").equalTo(datasourceMaster.col("master")), "left")
.map(
(MapFunction<Tuple2<Datasource, DatasourceMaster>, DatasourceMaster>) t2 -> t2._2(),
Encoders.bean(DatasourceMaster.class))
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "datasource");
}
private static <R extends Result> void execBulkTag(
SparkSession spark,
String inputPath,
String workingPath,
String datasourceMapPath,
String resultType,
Class<R> resultClazz) {
List<String> hostedByList = readPath(spark, datasourceMapPath, DatasourceMaster.class)
List<String> hostedByList = readPath(spark, workingPath + "datasource", DatasourceMaster.class)
.map((MapFunction<DatasourceMaster, String>) dm -> dm.getMaster(), Encoders.STRING())
.collectAsList();
readPath(spark, inputPath, resultClazz)
.map(patchResult(), Encoders.bean(resultClazz))
.filter(Objects::nonNull)
readPath(spark, inputPath + resultType, resultClazz)
.map(
(MapFunction<R, R>) value -> enrich(value, hostedByList),
Encoders.bean(resultClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
.json(workingPath + resultType);
readPath(spark, workingPath, resultClazz)
readPath(spark, workingPath + resultType, resultClazz)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(inputPath);
.json(inputPath + resultType);
}
private static <R extends Result> R enrich(R value, List<String> hostedByList) {
if (value.getDataInfo().getDeletedbyinference() == null) {
value.getDataInfo().setDeletedbyinference(false);
}
if (value.getContext() == null) {
value.setContext(new ArrayList<>());
}
if (value
.getInstance()
.stream()

View File

@ -29,6 +29,13 @@
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "rt",
"paramLongName": "resultType",
"paramDescription": "the result type",
"paramRequired": true
}
]

View File

@ -219,7 +219,7 @@
<error to="Kill"/>
</action>
<join name="wait" to="End"/>
<join name="wait" to="eosc_tag"/>
<action name="eosc_tag">
<spark xmlns="uri:oozie:spark-action:0.2">
@ -282,8 +282,9 @@
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/publication</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/publication</arg>
<arg>--sourcePath</arg><arg>${outputPath}/</arg>
<arg>--resultType</arg><arg>publication</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
@ -308,8 +309,9 @@
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/dataset</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/dataset</arg>
<arg>--sourcePath</arg><arg>${outputPath}/</arg>
<arg>--resultType</arg><arg>dataset</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
@ -333,8 +335,9 @@
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/software</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/software</arg>
<arg>--sourcePath</arg><arg>${outputPath}/</arg>
<arg>--resultType</arg><arg>software</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
@ -358,8 +361,9 @@
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/otherresearchproduct</arg>
<arg>--sourcePath</arg><arg>${outputPath}/</arg>
<arg>--resultType</arg><arg>otherresearchproduct</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>

View File

@ -47,8 +47,8 @@ public class BulkTagJobTest {
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\", "
+ " \"subject\" :\"$['subject'][*]['value']\" , " +
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='subject:fos')].value\"} ";
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='FOS')].value\"" +
"} ";
private static SparkSession spark;
@ -64,7 +64,7 @@ public class BulkTagJobTest {
.toString(
BulkTagJobTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf.xml"));
"/eu/dnetlib/dhp/bulktag/communityconfiguration/tagging_conf_dth.xml"));
} catch (IOException e) {
e.printStackTrace();
}
@ -758,7 +758,7 @@ public class BulkTagJobTest {
.textFile(workingDir.toString() + "/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(10, tmp.count());
Assertions.assertEquals(12, tmp.count());
org.apache.spark.sql.Dataset<Dataset> verificationDataset = spark
.createDataset(tmp.rdd(), Encoders.bean(Dataset.class));
@ -772,14 +772,14 @@ public class BulkTagJobTest {
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
idExplodeCommunity.show(false);
Assertions.assertEquals(5, idExplodeCommunity.count());
Assertions
.assertEquals(
3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
Assertions
.assertEquals(
2, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
// Assertions.assertEquals(5, idExplodeCommunity.count());
//
// Assertions
// .assertEquals(
// 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
// Assertions
// .assertEquals(
// 2, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
}
}

View File

@ -843,88 +843,136 @@
</zenodocommunities>
<organizations/>
</community>
<community id="dariah">
<community id="dth">
<advancedConstraints>
{
"criteria": [
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "North America"
},
{
"verb": "contains",
"field": "fos",
"value": "05"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "North America"
},
{
"verb": "contains",
"field": "fos",
"value": "06"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Mexico"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "United States"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Canada"
},
{
"verb": "contains",
"field": "fos",
"value": "05"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Mexico"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "United States"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Canada"
},
{
"verb": "contains",
"field": "fos",
"value": "06"
}
]
}
]
}
{"criteria":[
{"constraint":[
{"verb":"equals_caseinsensitive","field":"subject","value":"digital twins"},
{"verb":"contains_caseinsensitive","field":"subject","value":"health"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"structural"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"marine"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"avionics"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"battery"}
]},
{"constraint":[
{"verb":"contains_caseinsensitive","field":"title","value":"Human Digital Twins"}
]},
{"constraint":[
{"verb":"contains_caseinsensitive","field":"description","value":"Human Digital Twins"}
]},
{"constraint":[
{"verb":"equals_caseinsensitive","field":"subject","value":"Human Digital Twins"}
]},
{"constraint":[
{"verb":"contains_caseinsensitive","field":"title","value":"Virtual Human Twin"}
]},
{"constraint":[
{"verb":"contains_caseinsensitive","field":"description","value":"Virtual Human Twin"}
]},
{"constraint":[
{"verb":"equals_caseinsensitive","field":"subject","value":"Virtual Human Twin"}
]},
{"constraint":[
{"verb":"equals_caseinsensitive","field":"subject","value":"digital twin"},
{"verb":"contains_caseinsensitive","field":"subject","value":"health"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"structural"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"marine"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"avionics"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"battery"}
]},
{"constraint":[
{"verb":"contains_caseinsensitive","field":"title","value":"digital twin health"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"Acoustic"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"Health Monitoring"},
{"verb":"not_contains_caseinsensitive","field":"title","value":"Health Monitoring"},
{"verb":"not_contains_caseinsensitive","field":"title","value":"Health Management"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"Health Assessment"},
{"verb":"not_contains_caseinsensitive","field":"title","value":"Health Assessment"},
{"verb":"not_contains_caseinsensitive","field":"title","value":"Health status"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"ELECTRICAL ENGINEERING"},
{"verb":"not_contains_caseinsensitive","field":"subject","value":"Control and Systems Engineering"}
]}
]}
<!-- {-->
<!-- "criteria": [-->
<!-- {-->
<!-- "constraint": [-->
<!-- {-->
<!-- "verb": "equals_caseinsensitive",-->
<!-- "field": "subject",-->
<!-- "value": "North America"-->
<!-- },-->
<!-- {-->
<!-- "verb": "contains",-->
<!-- "field": "fos",-->
<!-- "value": "05"-->
<!-- }-->
<!-- ]-->
<!-- },-->
<!-- {-->
<!-- "constraint": [-->
<!-- {-->
<!-- "verb": "equals_caseinsensitive",-->
<!-- "field": "subject",-->
<!-- "value": "North America"-->
<!-- },-->
<!-- {-->
<!-- "verb": "contains",-->
<!-- "field": "fos",-->
<!-- "value": "06"-->
<!-- }-->
<!-- ]-->
<!-- },-->
<!-- {-->
<!-- "constraint": [-->
<!-- {-->
<!-- "verb": "equals_caseinsensitive",-->
<!-- "field": "subject",-->
<!-- "value": "Mexico"-->
<!-- },-->
<!-- {-->
<!-- "verb": "equals_caseinsensitive",-->
<!-- "field": "subject",-->
<!-- "value": "United States"-->
<!-- },-->
<!-- {-->
<!-- "verb": "equals_caseinsensitive",-->
<!-- "field": "subject",-->
<!-- "value": "Canada"-->
<!-- },-->
<!-- {-->
<!-- "verb": "contains",-->
<!-- "field": "fos",-->
<!-- "value": "05"-->
<!-- }-->
<!-- ]-->
<!-- },-->
<!-- {-->
<!-- "constraint": [-->
<!-- {-->
<!-- "verb": "equals_caseinsensitive",-->
<!-- "field": "subject",-->
<!-- "value": "Mexico"-->
<!-- },-->
<!-- {-->
<!-- "verb": "equals_caseinsensitive",-->
<!-- "field": "subject",-->
<!-- "value": "United States"-->
<!-- },-->
<!-- {-->
<!-- "verb": "equals_caseinsensitive",-->
<!-- "field": "subject",-->
<!-- "value": "Canada"-->
<!-- },-->
<!-- {-->
<!-- "verb": "contains",-->
<!-- "field": "fos",-->
<!-- "value": "06"-->
<!-- }-->
<!-- ]-->
<!-- }-->
<!-- ]-->
<!-- }-->
</advancedConstraints>
<subjects/>

View File

@ -1,19 +1,10 @@
DROP VIEW IF EXISTS ${hiveDbName}.result;
CREATE VIEW IF NOT EXISTS ${hiveDbName}.result as
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures, processingchargeamount from ${hiveDbName}.publication p
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures, processingchargeamount, eoscifguidelines from ${hiveDbName}.publication p
union all
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures, processingchargeamount from ${hiveDbName}.dataset d
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures, processingchargeamount, eoscifguidelines from ${hiveDbName}.dataset d
union all
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures, processingchargeamount from ${hiveDbName}.software s
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures, processingchargeamount, eoscifguidelines from ${hiveDbName}.software s
union all
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures, processingchargeamount from ${hiveDbName}.otherresearchproduct o;
ANALYZE TABLE ${hiveDbName}.datasource COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.organization COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.project COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.publication COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.dataset COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.otherresearchproduct COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.software COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.relation COMPUTE STATISTICS;
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures, processingchargeamount, eoscifguidelines from ${hiveDbName}.otherresearchproduct o;

View File

@ -207,12 +207,22 @@ public class XmlRecordFactory implements Serializable {
.map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p))
.collect(Collectors.toList()));
}
if (entity.getMeasures() != null) {
metadata.addAll(measuresAsXml(entity.getMeasures()));
}
if (ModelSupport.isResult(type)) {
final Result r = (Result) entity;
if (r.getMeasures() != null) {
metadata.addAll(measuresAsXml(r.getMeasures()));
if (r.getFulltext() != null) {
metadata
.addAll(
r
.getFulltext()
.stream()
.filter(Objects::nonNull)
.map(c -> XmlSerializationUtils.asXmlElement("fulltext", c.getValue()))
.collect(Collectors.toList()));
}
if (r.getEoscifguidelines() != null) {

View File

@ -39,6 +39,18 @@
<description>query used in the deleted by query operation</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>sparkDriverMemoryForJoining</name>
<description>memory for driver process</description>
@ -565,9 +577,9 @@
<class>eu.dnetlib.dhp.oa.provision.XmlConverterJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}