dnet-hadoop/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java


package eu.dnetlib.dhp.community;

import static eu.dnetlib.dhp.community.TagginConstants.*;

import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.oaf.*;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;

/**
 * Created by miriam on 02/08/2018.
 *
 * <p>Tags a {@link Result} with the communities it belongs to, matching it through its subjects,
 * the datasources it was collected from or is hosted by, and the Zenodo communities it is part
 * of, as described by a {@link CommunityConfiguration}.
 */
public class ResultTagger implements Serializable {
2020-03-03 16:38:50 +01:00
private String trust = "0.8";
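
/**
 * Removes from the result every context whose id refers to a Zenodo community
 * (i.e. contains {@code ZENODO_COMMUNITY_INDICATOR}).
 *
 * @param result the result to clean
 * @return true if at least one context entry was removed
 */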
private boolean clearContext(Result result) {
int tmp = result.getContext().size();
List<Context> clist =
result.getContext().stream()
.filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR)))
.collect(Collectors.toList());
result.setContext(clist);
return (tmp != clist.size());
}
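
/**
 * Evaluates the configured json path expressions against the json serialization of the result.
 *
 * @param result the result to inspect
 * @param params map from parameter name to json path expression (may be null)
 * @return a map from parameter name to the values matched by the corresponding path; a
 *     parameter whose path is not found maps to an empty list
 */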
private Map<String, List<String>> getParamMap(final Result result, Map<String, String> params) {
Map<String, List<String>> param = new HashMap<>();
String json = new Gson().toJson(result, Result.class);
DocumentContext jsonContext = JsonPath.parse(json);
if (params == null) {
params = new HashMap<>();
}
for (Map.Entry<String, String> entry : params.entrySet()) {
try {
param.put(entry.getKey(), jsonContext.read(entry.getValue()));
} catch (com.jayway.jsonpath.PathNotFoundException e) {
// the path has no match in this result: associate the key with an empty list
param.put(entry.getKey(), new ArrayList<>());
}
}
return param;
}
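
/**
 * Enriches the result with the contexts of the communities it matches according to the given
 * configuration. Communities are matched through the result subjects, the datasources it was
 * collected from or is hosted by, and the Zenodo communities it belongs to; every context
 * added or updated this way carries a {@link DataInfo} recording the bulktagging provenance.
 *
 * @param result the result to enrich
 * @param conf the community configuration
 * @param criteria map from parameter name to json path, used when matching datasources
 * @return the (possibly) enriched result
 */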
public <R extends Result> R enrichContextCriteria(
final R result, final CommunityConfiguration conf, final Map<String, String> criteria) {
final Map<String, List<String>> param = getParamMap(result, criteria);
// If the entity is flagged as deletedbyinference, only clean the context list from all the
// Zenodo communities and return it as is
if (result.getDataInfo().getDeletedbyinference()) {
clearContext(result);
return result;
}
// communities contains all the communities to be added as context for the result
final Set<String> communities = new HashSet<>();
// tagging for Subject
final Set<String> subjects = new HashSet<>();
Optional<List<StructuredProperty>> oresultsubj = Optional.ofNullable(result.getSubject());
if (oresultsubj.isPresent()) {
oresultsubj.get().stream()
.map(subject -> subject.getValue())
.filter(StringUtils::isNotBlank)
.map(String::toLowerCase)
.map(String::trim)
.collect(Collectors.toCollection(HashSet::new))
.forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s)));
}
communities.addAll(subjects);
// Tagging for datasource
final Set<String> datasources = new HashSet<>();
Optional<List<Instance>> oresultinstance = Optional.ofNullable(result.getInstance());
if (oresultinstance.isPresent()) {
oresultinstance.get().stream()
.map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
.flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
.map(s -> StringUtils.substringAfter(s, "|"))
.collect(Collectors.toCollection(HashSet::new))
.forEach(
dsId ->
datasources.addAll(
conf.getCommunityForDatasource(dsId, param)));
}
communities.addAll(datasources);
// Tagging for Zenodo communities
final Set<String> czenodo = new HashSet<>();
Optional<List<Context>> oresultcontext = Optional.ofNullable(result.getContext());
if (oresultcontext.isPresent()) {
oresultcontext.get().stream()
.filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR))
.forEach(
c ->
czenodo.addAll(
conf.getCommunityForZenodoCommunityValue(
c.getId()
.substring(c.getId().lastIndexOf("/") + 1)
.trim())));
}
communities.addAll(czenodo);
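
// Remove the Zenodo community contexts from the result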
clearContext(result);
// Verify if there is something to bulktag
if (communities.isEmpty()) {
return result;
}
// Add bulktagging provenance to the contexts already present in the result
result.getContext()
.forEach(
c -> {
if (communities.contains(c.getId())) {
Optional<List<DataInfo>> optDataInfoList =
Optional.ofNullable(c.getDataInfo());
List<DataInfo> dataInfoList;
if (optDataInfoList.isPresent()) dataInfoList = optDataInfoList.get();
else {
dataInfoList = new ArrayList<>();
c.setDataInfo(dataInfoList);
}
if (subjects.contains(c.getId()))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_SUBJECT,
CLASS_NAME_BULKTAG_SUBJECT));
if (datasources.contains(c.getId()))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_DATASOURCE,
CLASS_NAME_BULKTAG_DATASOURCE));
if (czenodo.contains(c.getId()))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_CZENODO,
CLASS_NAME_BULKTAG_ZENODO));
}
});
// Communities already present on the result have been handled above: keep only the new ones
communities.removeAll(
result.getContext().stream().map(Context::getId).collect(Collectors.toSet()));
if (communities.isEmpty()) return result;
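
// Build a new context entry for every matched community not yet present in the result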
List<Context> toaddcontext =
communities.stream()
.map(
c -> {
Context context = new Context();
context.setId(c);
List<DataInfo> dataInfoList = new ArrayList<>();
if (subjects.contains(c))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_SUBJECT,
CLASS_NAME_BULKTAG_SUBJECT));
if (datasources.contains(c))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_DATASOURCE,
CLASS_NAME_BULKTAG_DATASOURCE));
if (czenodo.contains(c))
dataInfoList.add(
getDataInfo(
BULKTAG_DATA_INFO_TYPE,
CLASS_ID_CZENODO,
CLASS_NAME_BULKTAG_ZENODO));
context.setDataInfo(dataInfoList);
return context;
})
.collect(Collectors.toList());
result.getContext().addAll(toaddcontext);
return result;
}
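
/**
 * Builds a {@link DataInfo} marking the community association as inferred by the bulktagging
 * process, with the given inference provenance and provenance action class.
 */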
public static DataInfo getDataInfo(
String inferenceProvenance, String inferenceClassId, String inferenceClassName) {
DataInfo di = new DataInfo();
di.setInferred(true);
di.setInferenceprovenance(inferenceProvenance);
di.setProvenanceaction(getQualifier(inferenceClassId, inferenceClassName));
return di;
}
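
/** Builds the provenance action {@link Qualifier} for the given inference class id and name. */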
public static Qualifier getQualifier(String inferenceClassId, String inferenceClassName) {
Qualifier pa = new Qualifier();
pa.setClassid(inferenceClassId);
pa.setClassname(inferenceClassName);
pa.setSchemeid(DNET_SCHEMA_ID);
pa.setSchemename(DNET_SCHEMA_NAME);
return pa;
}
}