forked from D-Net/dnet-hadoop

Fixed merge of dhp dedup

This commit is contained in:
parent 8c9a56a0c8
commit 15d9106b3f
@@ -0,0 +1,119 @@
package eu.dnetlib.dedup;

import eu.dnetlib.dhp.schema.oaf.Field;
import org.apache.commons.lang.StringUtils;

import java.time.Year;
import java.util.*;
import java.util.stream.Collectors;

import static java.util.Collections.reverseOrder;
import static java.util.Map.Entry.comparingByValue;
import static java.util.stream.Collectors.toMap;
import static org.apache.commons.lang.StringUtils.endsWith;
import static org.apache.commons.lang.StringUtils.substringBefore;

public class DatePicker {

    private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
    private static final String DATE_DEFAULT_SUFFIX = "01-01";
    private static final int YEAR_LB = 1300;
    private static final int YEAR_UB = Year.now().getValue() + 5;

    public static Field<String> pick(final Collection<String> dateofacceptance) {

        final Map<String, Integer> frequencies = dateofacceptance
                .parallelStream()
                .filter(StringUtils::isNotBlank)
                .collect(
                        Collectors.toConcurrentMap(
                                w -> w, w -> 1, Integer::sum));

        if (frequencies.isEmpty()) {
            return new Field<>();
        }

        final Field<String> date = new Field<>();
        date.setValue(frequencies.keySet().iterator().next());

        // let's sort this map by values first, filtering out invalid dates
        final Map<String, Integer> sorted = frequencies
                .entrySet()
                .stream()
                .filter(d -> StringUtils.isNotBlank(d.getKey()))
                .filter(d -> d.getKey().matches(DATE_PATTERN))
                .filter(d -> inRange(d.getKey()))
                .sorted(reverseOrder(comparingByValue()))
                .collect(
                        toMap(
                                Map.Entry::getKey,
                                Map.Entry::getValue, (e1, e2) -> e2,
                                LinkedHashMap::new));

        // shortcut
        if (sorted.size() == 0) {
            return date;
        }

        // voting method (1/3 + 1) wins
        if (sorted.size() >= 3) {
            final int acceptThreshold = (sorted.size() / 3) + 1;
            final List<String> accepted = sorted.entrySet().stream()
                    .filter(e -> e.getValue() >= acceptThreshold)
                    .map(e -> e.getKey())
                    .collect(Collectors.toList());

            // cannot find strong majority
            if (accepted.isEmpty()) {
                final int max = sorted.values().iterator().next();
                Optional<String> first = sorted.entrySet().stream()
                        .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX))
                        .map(Map.Entry::getKey)
                        .findFirst();
                if (first.isPresent()) {
                    date.setValue(first.get());
                    return date;
                }

                date.setValue(sorted.keySet().iterator().next());
                return date;
            }

            if (accepted.size() == 1) {
                date.setValue(accepted.get(0));
                return date;
            } else {
                final Optional<String> first = accepted.stream()
                        .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX))
                        .findFirst();
                if (first.isPresent()) {
                    date.setValue(first.get());
                    return date;
                }

                return date;
            }

            // 1st non YYYY-01-01 is returned
        } else {
            if (sorted.size() == 2) {
                for (Map.Entry<String, Integer> e : sorted.entrySet()) {
                    if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) {
                        date.setValue(e.getKey());
                        return date;
                    }
                }
            }

            // none of the dates seems good enough, return the 1st one
            date.setValue(sorted.keySet().iterator().next());
            return date;
        }
    }

    private static boolean inRange(final String date) {
        final int year = Integer.parseInt(substringBefore(date, "-"));
        return year >= YEAR_LB && year <= YEAR_UB;
    }

}
@@ -0,0 +1,283 @@
package eu.dnetlib.dedup;

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import scala.Tuple2;

import java.util.Collection;

public class DedupRecordFactory {

    public static JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf) {
        long ts = System.currentTimeMillis();
        //<id, json_entity>
        final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
                .mapToPair((PairFunction<String, String, String>) it ->
                        new Tuple2<String, String>(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it)
                );

        //<source, target>: source is the dedup_id, target is the id of the mergedIn
        JavaPairRDD<String, String> mergeRels = spark
                .read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class))
                .where("relClass=='merges'")
                .javaRDD()
                .mapToPair(
                        (PairFunction<Relation, String, String>) r ->
                                new Tuple2<String, String>(r.getTarget(), r.getSource())
                );

        //<dedup_id, json_entity_merged>
        final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);

        JavaPairRDD<String, Iterable<String>> sortedJoinResult = joinResult.groupByKey();

        switch (entityType) {
            case publication:
                return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts));
            case dataset:
                return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts));
            case project:
                return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts));
            case software:
                return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts));
            case datasource:
                return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts));
            case organization:
                return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts));
            case otherresearchproduct:
                return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts));
            default:
                return null;
        }

    }

    private static Publication publicationMerger(Tuple2<String, Iterable<String>> e, final long ts) {

        Publication p = new Publication(); //the result of the merge, to be returned at the end

        p.setId(e._1());

        final ObjectMapper mapper = new ObjectMapper();
        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

        final Collection<String> dateofacceptance = Lists.newArrayList();

        if (e._2() != null)
            e._2().forEach(pub -> {
                try {
                    Publication publication = mapper.readValue(pub, Publication.class);

                    p.mergeFrom(publication);
                    p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
                    //add to the list if they are not null
                    if (publication.getDateofacceptance() != null)
                        dateofacceptance.add(publication.getDateofacceptance().getValue());
                } catch (Exception exc) {
                    throw new RuntimeException(exc);
                }
            });
        p.setDateofacceptance(DatePicker.pick(dateofacceptance));
        if (p.getDataInfo() == null)
            p.setDataInfo(new DataInfo());
        p.getDataInfo().setTrust("0.9");
        p.setLastupdatetimestamp(ts);
        return p;
    }

    private static Dataset datasetMerger(Tuple2<String, Iterable<String>> e, final long ts) {

        Dataset d = new Dataset(); //the result of the merge, to be returned at the end

        d.setId(e._1());

        final ObjectMapper mapper = new ObjectMapper();
        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

        final Collection<String> dateofacceptance = Lists.newArrayList();

        if (e._2() != null)
            e._2().forEach(dat -> {
                try {
                    Dataset dataset = mapper.readValue(dat, Dataset.class);

                    d.mergeFrom(dataset);
                    d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor()));
                    //add to the list if they are not null
                    if (dataset.getDateofacceptance() != null)
                        dateofacceptance.add(dataset.getDateofacceptance().getValue());
                } catch (Exception exc) {
                    throw new RuntimeException(exc);
                }
            });
        d.setDateofacceptance(DatePicker.pick(dateofacceptance));
        if (d.getDataInfo() == null)
            d.setDataInfo(new DataInfo());
        d.getDataInfo().setTrust("0.9");
        d.setLastupdatetimestamp(ts);
        return d;
    }

    private static Project projectMerger(Tuple2<String, Iterable<String>> e, final long ts) {

        Project p = new Project(); //the result of the merge, to be returned at the end

        p.setId(e._1());

        final ObjectMapper mapper = new ObjectMapper();
        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
        if (e._2() != null)
            e._2().forEach(proj -> {
                try {
                    Project project = mapper.readValue(proj, Project.class);

                    p.mergeFrom(project);
                } catch (Exception exc) {
                    throw new RuntimeException(exc);
                }
            });
        if (p.getDataInfo() == null)
            p.setDataInfo(new DataInfo());
        p.getDataInfo().setTrust("0.9");
        p.setLastupdatetimestamp(ts);
        return p;
    }

    private static Software softwareMerger(Tuple2<String, Iterable<String>> e, final long ts) {

        Software s = new Software(); //the result of the merge, to be returned at the end

        s.setId(e._1());
        final ObjectMapper mapper = new ObjectMapper();
        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
        final Collection<String> dateofacceptance = Lists.newArrayList();
        if (e._2() != null)
            e._2().forEach(soft -> {
                try {
                    Software software = mapper.readValue(soft, Software.class);

                    s.mergeFrom(software);
                    s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor()));
                    //add to the list if they are not null
                    if (software.getDateofacceptance() != null)
                        dateofacceptance.add(software.getDateofacceptance().getValue());
                } catch (Exception exc) {
                    throw new RuntimeException(exc);
                }
            });
        s.setDateofacceptance(DatePicker.pick(dateofacceptance));
        if (s.getDataInfo() == null)
            s.setDataInfo(new DataInfo());
        s.getDataInfo().setTrust("0.9");
        s.setLastupdatetimestamp(ts);
        return s;
    }

    private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e, final long ts) {
        Datasource d = new Datasource(); //the result of the merge, to be returned at the end
        d.setId(e._1());
        final ObjectMapper mapper = new ObjectMapper();
        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
        if (e._2() != null)
            e._2().forEach(dat -> {
                try {
                    Datasource datasource = mapper.readValue(dat, Datasource.class);

                    d.mergeFrom(datasource);
                } catch (Exception exc) {
                    throw new RuntimeException(exc);
                }
            });
        if (d.getDataInfo() == null)
            d.setDataInfo(new DataInfo());
        d.getDataInfo().setTrust("0.9");
        d.setLastupdatetimestamp(ts);
        return d;
    }

    private static Organization organizationMerger(Tuple2<String, Iterable<String>> e, final long ts) {

        Organization o = new Organization(); //the result of the merge, to be returned at the end

        o.setId(e._1());

        final ObjectMapper mapper = new ObjectMapper();
        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

        StringBuilder trust = new StringBuilder("0.0");

        if (e._2() != null)
            e._2().forEach(pub -> {
                try {
                    Organization organization = mapper.readValue(pub, Organization.class);

                    final String currentTrust = organization.getDataInfo().getTrust();
                    if (!"1.0".equals(currentTrust)) {
                        trust.setLength(0);
                        trust.append(currentTrust);
                    }
                    o.mergeFrom(organization);

                } catch (Exception exc) {
                    throw new RuntimeException(exc);
                }
            });

        if (o.getDataInfo() == null) {
            o.setDataInfo(new DataInfo());
        }
        o.getDataInfo().setTrust("0.9");
        o.setLastupdatetimestamp(ts);

        return o;
    }

    private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e, final long ts) {

        OtherResearchProduct o = new OtherResearchProduct(); //the result of the merge, to be returned at the end

        o.setId(e._1());

        final ObjectMapper mapper = new ObjectMapper();
        mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

        final Collection<String> dateofacceptance = Lists.newArrayList();

        if (e._2() != null)
            e._2().forEach(orp -> {
                try {
                    OtherResearchProduct otherResearchProduct = mapper.readValue(orp, OtherResearchProduct.class);

                    o.mergeFrom(otherResearchProduct);
                    o.setAuthor(DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor()));
                    //add to the list if they are not null
                    if (otherResearchProduct.getDateofacceptance() != null)
                        dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue());
                } catch (Exception exc) {
                    throw new RuntimeException(exc);
                }
            });
        if (o.getDataInfo() == null)
            o.setDataInfo(new DataInfo());
        o.setDateofacceptance(DatePicker.pick(dateofacceptance));
        o.getDataInfo().setTrust("0.9");
        o.setLastupdatetimestamp(ts);
        return o;
    }

}
@@ -0,0 +1,217 @@
package eu.dnetlib.dedup;

import com.google.common.collect.Sets;
import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;

import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;

public class DedupUtility {
    private static final Double THRESHOLD = 0.95;

    public static Map<String, LongAccumulator> constructAccumulator(final DedupConfig dedupConf, final SparkContext context) {

        Map<String, LongAccumulator> accumulators = new HashMap<>();

        String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
        accumulators.put(acc1, context.longAccumulator(acc1));
        String acc2 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
        accumulators.put(acc2, context.longAccumulator(acc2));
        String acc3 = String.format("%s::%s", dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
        accumulators.put(acc3, context.longAccumulator(acc3));
        String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
        accumulators.put(acc4, context.longAccumulator(acc4));
        String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
        accumulators.put(acc5, context.longAccumulator(acc5));
        String acc6 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
        accumulators.put(acc6, context.longAccumulator(acc6));

        return accumulators;
    }

    public static JavaRDD<String> loadDataFromHDFS(String path, JavaSparkContext context) {
        return context.textFile(path);
    }

    public static void deleteIfExists(String path) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(new Path(path))) {
            fileSystem.delete(new Path(path), true);
        }
    }

    public static DedupConfig loadConfigFromHDFS(String path) throws IOException {

        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(conf);
        FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path)));

        return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name()));

    }

    static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
        final StringWriter sw = new StringWriter();
        try {
            IOUtils.copy(clazz.getResourceAsStream(filename), sw);
            return sw.toString();
        } catch (final IOException e) {
            throw new RuntimeException("cannot load resource from classpath: " + filename);
        }
    }

    static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
        return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
    }

    public static String md5(final String s) {
        try {
            final MessageDigest md = MessageDigest.getInstance("MD5");
            md.update(s.getBytes("UTF-8"));
            return new String(Hex.encodeHex(md.digest()));
        } catch (final Exception e) {
            System.err.println("Error creating id");
            return null;
        }
    }

    public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
        int pa = countAuthorsPids(a);
        int pb = countAuthorsPids(b);
        List<Author> base, enrich;
        int sa = authorsSize(a);
        int sb = authorsSize(b);

        if (pa == pb) {
            base = sa > sb ? a : b;
            enrich = sa > sb ? b : a;
        } else {
            base = pa > pb ? a : b;
            enrich = pa > pb ? b : a;
        }
        enrichPidFromList(base, enrich);
        return base;
    }

    private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
        if (base == null || enrich == null)
            return;
        final Map<String, Author> basePidAuthorMap = base.stream()
                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
                .flatMap(a -> a.getPid()
                        .stream()
                        .map(p -> new Tuple2<>(p.toComparableString(), a))
                ).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));

        final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
                .stream()
                .filter(a -> a.getPid() != null && a.getPid().size() > 0)
                .flatMap(a -> a.getPid().stream().filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())).map(p -> new Tuple2<>(p, a)))
                .collect(Collectors.toList());

        pidToEnrich.forEach(a -> {
            Optional<Tuple2<Double, Author>> simAuthor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1));
            if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) {
                Author r = simAuthor.get()._2();
                r.getPid().add(a._1());
            }
        });
    }

    public static String createEntityPath(final String basePath, final String entityType) {
        return String.format("%s/%s", basePath, entityType);
    }

    public static String createSimRelPath(final String basePath, final String entityType) {
        return String.format("%s/%s/simRel", basePath, entityType);
    }

    public static String createMergeRelPath(final String basePath, final String entityType) {
        return String.format("%s/%s/mergeRel", basePath, entityType);
    }

    private static Double sim(Author a, Author b) {

        final Person pa = parse(a);
        final Person pb = parse(b);

        if (pa.isAccurate() && pb.isAccurate()) {
            return new JaroWinkler().score(
                    normalize(pa.getSurnameString()),
                    normalize(pb.getSurnameString()));
        } else {
            return new JaroWinkler().score(
                    normalize(pa.getNormalisedFullname()),
                    normalize(pb.getNormalisedFullname()));
        }
    }

    private static String normalize(final String s) {
        return nfd(s).toLowerCase()
                // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
                .replaceAll("(\\W)+", " ")
                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
                .replaceAll("(\\p{Punct})+", " ")
                .replaceAll("(\\d)+", " ")
                .replaceAll("(\\n)+", " ")
                .trim();
    }

    private static String nfd(final String s) {
        return Normalizer.normalize(s, Normalizer.Form.NFD);
    }

    private static Person parse(Author author) {
        if (StringUtils.isNotBlank(author.getSurname())) {
            return new Person(author.getSurname() + ", " + author.getName(), false);
        } else {
            return new Person(author.getFullname(), false);
        }
    }

    private static int countAuthorsPids(List<Author> authors) {
        if (authors == null)
            return 0;

        return (int) authors.stream().filter(DedupUtility::hasPid).count();
    }

    private static int authorsSize(List<Author> authors) {
        if (authors == null)
            return 0;
        return authors.size();
    }

    private static boolean hasPid(Author a) {
        if (a == null || a.getPid() == null || a.getPid().size() == 0)
            return false;
        return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
    }
}
@@ -0,0 +1,162 @@
package eu.dnetlib.dedup;

import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.util.LongAccumulator;
import scala.Serializable;
import scala.Tuple2;

import java.util.*;
import java.util.stream.Collectors;

public class Deduper implements Serializable {

    private static final Log log = LogFactory.getLog(Deduper.class);

    /**
     * @param context  the spark context
     * @param entities list of JSON entities to be deduped
     * @param config   the dedup configuration
     * @return the list of relations generated by the deduplication
     */
    public static JavaPairRDD<String, String> dedup(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {

        Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());

        //create vertexes of the graph: <ID, MapDocument>
        JavaPairRDD<String, MapDocument> mapDocs = mapToVertexes(context, entities, config);

        //create blocks for deduplication
        JavaPairRDD<String, Iterable<MapDocument>> blocks = createBlocks(context, mapDocs, config);

        //create relations by comparing only elements in the same group
        return computeRelations(context, blocks, config);

//        final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(), it._2().hashCode(), "equalTo")).rdd();
//
//        RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair((PairFunction<Tuple2<String, MapDocument>, Object, MapDocument>) t -> new Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2())).rdd();
//        accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));
//
//        return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
    }

    /**
     * @param context the spark context
     * @param blocks  list of blocks
     * @param config  the dedup configuration
     * @return the list of relations generated by the deduplication
     */
    public static JavaPairRDD<String, String> computeRelations(JavaSparkContext context, JavaPairRDD<String, Iterable<MapDocument>> blocks, DedupConfig config) {

        Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());

        return blocks.flatMapToPair((PairFlatMapFunction<Tuple2<String, Iterable<MapDocument>>, String, String>) it -> {
            final SparkReporter reporter = new SparkReporter(accumulators);
            new BlockProcessor(config).process(it._1(), it._2(), reporter);
            return reporter.getRelations().iterator();

        }).mapToPair(
                (PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item ->
                        new Tuple2<String, Tuple2<String, String>>(item._1() + item._2(), item))
                .reduceByKey((a, b) -> a)
                .mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
    }

    /**
     * @param context the spark context
     * @param mapDocs list of entities: <id, entity>
     * @param config  the dedup configuration
     * @return the list of blocks based on clustering of dedup configuration
     */
    public static JavaPairRDD<String, Iterable<MapDocument>> createBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
        return mapDocs
                //the reduce is just to be sure that we don't have documents with the same id
                .reduceByKey((a, b) -> a)
                .map(Tuple2::_2)
                //Clustering: from <id, doc> to List<groupkey,doc>
                .flatMapToPair((PairFlatMapFunction<MapDocument, String, MapDocument>) a ->
                        DedupUtility.getGroupingKeys(config, a)
                                .stream()
                                .map(it -> new Tuple2<>(it, a))
                                .collect(Collectors.toList())
                                .iterator())
                .groupByKey();
    }

    public static JavaPairRDD<String, List<MapDocument>> createsortedBlocks(JavaSparkContext context, JavaPairRDD<String, MapDocument> mapDocs, DedupConfig config) {
        final String of = config.getWf().getOrderField();
        final int maxQueueSize = config.getWf().getGroupMaxSize();
        return mapDocs
                //the reduce is just to be sure that we don't have documents with the same id
                .reduceByKey((a, b) -> a)
                .map(Tuple2::_2)
                //Clustering: from <id, doc> to List<groupkey,doc>
                .flatMapToPair((PairFlatMapFunction<MapDocument, String, List<MapDocument>>) a ->
                        DedupUtility.getGroupingKeys(config, a)
                                .stream()
                                .map(it -> {
                                    List<MapDocument> tmp = new ArrayList<>();
                                    tmp.add(a);
                                    return new Tuple2<>(it, tmp);
                                })
                                .collect(Collectors.toList())
                                .iterator())
                .reduceByKey((Function2<List<MapDocument>, List<MapDocument>, List<MapDocument>>) (v1, v2) -> {
                    v1.addAll(v2);
                    v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue()));
                    if (v1.size() > maxQueueSize)
                        return new ArrayList<>(v1.subList(0, maxQueueSize));
                    return v1;
                });
    }

    /**
     * @param context  the spark context
     * @param entities list of JSON entities
     * @param config   the dedup configuration
     * @return the list of vertexes: <id, mapDocument>
     */
    public static JavaPairRDD<String, MapDocument> mapToVertexes(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {

        return entities.mapToPair((PairFunction<String, String, MapDocument>) s -> {

            MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s);
            return new Tuple2<String, MapDocument>(mapDocument.getIdentifier(), mapDocument);

        });
    }

    public static JavaPairRDD<String, String> computeRelations2(JavaSparkContext context, JavaPairRDD<String, List<MapDocument>> blocks, DedupConfig config) {
        Map<String, LongAccumulator> accumulators = DedupUtility.constructAccumulator(config, context.sc());

        return blocks.flatMapToPair((PairFlatMapFunction<Tuple2<String, List<MapDocument>>, String, String>) it -> {
            try {
                final SparkReporter reporter = new SparkReporter(accumulators);
                new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter);
                return reporter.getRelations().iterator();
            } catch (Exception e) {
                throw new RuntimeException(it._2().get(0).getIdentifier(), e);
            }
        }).mapToPair(
                (PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>) item ->
                        new Tuple2<String, Tuple2<String, String>>(item._1() + item._2(), item))
                .reduceByKey((a, b) -> a)
                .mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
    }
}
@@ -0,0 +1,15 @@
package eu.dnetlib.dedup;

public enum OafEntityType {

    datasource,
    organization,
    project,
    dataset,
    otherresearchproduct,
    software,
    publication

}
@@ -0,0 +1,79 @@
package eu.dnetlib.dedup;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import eu.dnetlib.dedup.graph.ConnectedComponent;
import eu.dnetlib.dedup.graph.GraphProcessor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

public class SparkCreateConnectedComponent {

    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
        parser.parseArgument(args);
        final SparkSession spark = SparkSession
                .builder()
                .appName(SparkCreateConnectedComponent.class.getSimpleName())
                .master(parser.get("master"))
                .getOrCreate();

        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
        final String inputPath = parser.get("sourcePath");
        final String entity = parser.get("entity");
        final String targetPath = parser.get("targetPath");
//        final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json")));
        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));

        final JavaPairRDD<Object, String> vertexes = sc.textFile(inputPath + "/" + entity)
                .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
                .mapToPair((PairFunction<String, Object, String>)
                        s -> new Tuple2<Object, String>(getHashcode(s), s)
                );

        final Dataset<Relation> similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath, entity)).as(Encoders.bean(Relation.class));
        final RDD<Edge<String>> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd();
        final JavaRDD<ConnectedComponent> cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD();
        final Dataset<Relation> mergeRelation = spark.createDataset(cc.filter(k -> k.getDocIds().size() > 1).flatMap((FlatMapFunction<ConnectedComponent, Relation>) c ->
                c.getDocIds()
                        .stream()
                        .flatMap(id -> {
                            List<Relation> tmp = new ArrayList<>();
                            Relation r = new Relation();
                            r.setSource(c.getCcId());
                            r.setTarget(id);
                            r.setRelClass("merges");
                            tmp.add(r);
                            r = new Relation();
                            r.setTarget(c.getCcId());
                            r.setSource(id);
                            r.setRelClass("isMergedIn");
                            tmp.add(r);
                            return tmp.stream();
                        }).iterator()).rdd(), Encoders.bean(Relation.class));
        mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath, entity));
    }

    public static long getHashcode(final String id) {
        return Hashing.murmur3_128().hashUnencodedChars(id).asLong();
    }
}
@@ -0,0 +1,34 @@
package eu.dnetlib.dedup;

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.pace.config.DedupConfig;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class SparkCreateDedupRecord {
    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json")));
        parser.parseArgument(args);
        final SparkSession spark = SparkSession
                .builder()
                .appName(SparkCreateDedupRecord.class.getSimpleName())
                .master(parser.get("master"))
                .getOrCreate();

        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
        final String sourcePath = parser.get("sourcePath");
        final String entity = parser.get("entity");
        final String dedupPath = parser.get("dedupPath");
        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));

        final JavaRDD<OafEntity> dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath, entity), DedupUtility.createEntityPath(sourcePath, entity), OafEntityType.valueOf(entity), dedupConf);
        dedupRecord.map(r -> {
            ObjectMapper mapper = new ObjectMapper();
            return mapper.writeValueAsString(r);
        }).saveAsTextFile(dedupPath + "/" + entity + "/dedup_records");
    }
}
@@ -0,0 +1,73 @@
package eu.dnetlib.dedup;

import com.google.common.hash.Hashing;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.List;

/**
 * This Spark class creates similarity relations between entities and saves the result.
 *
 * Expected parameters:
 *   sourcePath
 *   entityType
 *   targetPath
 */
public class SparkCreateSimRels {

    public static void main(String[] args) throws Exception {
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json")));
        parser.parseArgument(args);
        final SparkSession spark = SparkSession
                .builder()
                .appName(SparkCreateSimRels.class.getSimpleName())
                .master(parser.get("master"))
                .getOrCreate();

        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
        final String inputPath = parser.get("sourcePath");
        final String entity = parser.get("entity");
        final String targetPath = parser.get("targetPath");
//        final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
        final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf"));

        JavaPairRDD<String, MapDocument> mapDocument = sc.textFile(inputPath + "/" + entity)
                .mapToPair(s -> {
                    MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
                    return new Tuple2<>(d.getIdentifier(), d);
                });

        //create blocks for deduplication
        JavaPairRDD<String, List<MapDocument>> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf);
//        JavaPairRDD<String, Iterable<MapDocument>> blocks = Deduper.createBlocks(sc, mapDocument, dedupConf);

        //create relations by comparing only elements in the same group
        final JavaPairRDD<String, String> dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf);
//        final JavaPairRDD<String, String> dedupRels = Deduper.computeRelations(sc, blocks, dedupConf);

        final JavaRDD<Relation> isSimilarToRDD = dedupRels.map(simRel -> {
            final Relation r = new Relation();
            r.setSource(simRel._1());
            r.setTarget(simRel._2());
            r.setRelClass("isSimilarTo");
            return r;
        });

        spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save(DedupUtility.createSimRelPath(targetPath, entity));

    }
}
@@ -0,0 +1,47 @@
package eu.dnetlib.dedup;

import eu.dnetlib.pace.util.Reporter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.util.LongAccumulator;
import scala.Serializable;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class SparkReporter implements Serializable, Reporter {

    final List<Tuple2<String, String>> relations = new ArrayList<>();
    private static final Log log = LogFactory.getLog(SparkReporter.class);
    Map<String, LongAccumulator> accumulators;

    public SparkReporter(Map<String, LongAccumulator> accumulators) {
        this.accumulators = accumulators;
    }

    public void incrementCounter(String counterGroup, String counterName, long delta, Map<String, LongAccumulator> accumulators) {

        final String accumulatorName = String.format("%s::%s", counterGroup, counterName);
        if (accumulators.containsKey(accumulatorName)) {
            accumulators.get(accumulatorName).add(delta);
        }

    }

    @Override
    public void incrementCounter(String counterGroup, String counterName, long delta) {

        incrementCounter(counterGroup, counterName, delta, accumulators);
    }

    @Override
    public void emit(String type, String from, String to) {
        relations.add(new Tuple2<>(from, to));
    }

    public List<Tuple2<String, String>> getRelations() {
        return relations;
    }
}
@@ -0,0 +1,80 @@
package eu.dnetlib.dedup.graph;

import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dedup.DedupUtility;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;

import java.io.IOException;
import java.io.Serializable;
import java.util.Set;

public class ConnectedComponent implements Serializable {

    private Set<String> docIds;
    private String ccId;

    public ConnectedComponent() {
    }

    public ConnectedComponent(Set<String> docIds) {
        this.docIds = docIds;
        createID();
    }

    public String createID() {
        if (docIds.size() > 1) {
            final String s = getMin();
            String prefix = s.split("\\|")[0];
            ccId = prefix + "|dedup_______::" + DedupUtility.md5(s);
            return ccId;
        } else {
            return docIds.iterator().next();
        }
    }

    @JsonIgnore
    public String getMin() {

        final StringBuilder min = new StringBuilder();
        docIds.forEach(i -> {
            if (StringUtils.isBlank(min.toString())) {
                min.append(i);
            } else {
                if (min.toString().compareTo(i) > 0) {
                    min.setLength(0);
                    min.append(i);
                }
            }
        });
        return min.toString();
    }

    @Override
    public String toString() {
        ObjectMapper mapper = new ObjectMapper();
        try {
            return mapper.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Failed to create Json: ", e);
        }
    }

    public Set<String> getDocIds() {
        return docIds;
    }

    public void setDocIds(Set<String> docIds) {
        this.docIds = docIds;
    }

    public String getCcId() {
        return ccId;
    }

    public void setCcId(String ccId) {
        this.ccId = ccId;
    }
}
@@ -0,0 +1,37 @@
package eu.dnetlib.dedup.graph

import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions;

object GraphProcessor {

  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
    val graph: Graph[String, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
    val cc = graph.connectedComponents(maxIterations).vertices

    val joinResult = vertexes.leftOuterJoin(cc).map {
      case (id, (openaireId, cc)) => {
        if (cc.isEmpty) {
          (id, openaireId)
        }
        else {
          (cc.get, openaireId)
        }
      }
    }
    val connectedComponents = joinResult.groupByKey()
      .map[ConnectedComponent](cc => asConnectedComponent(cc))
    connectedComponents
  }

  def asConnectedComponent(group: (VertexId, Iterable[String])): ConnectedComponent = {
    val docs = group._2.toSet[String]
    val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs));
    connectedComponent
  }

}
@@ -0,0 +1,18 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
</configuration>
@@ -25,10 +25,8 @@
         <description>memory for individual executor</description>
     </property>
 </parameters>
 
     <start to="DeleteWorkingPath"/>
-
-
     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
@@ -163,33 +161,6 @@
         <ok to="replaceEntity"/>
         <error to="Kill"/>
     </action>
-
-    <!-- <action name="updateDeletedByInferenceRelation">-->
-    <!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
-    <!-- <job-tracker>${jobTracker}</job-tracker>-->
-    <!-- <name-node>${nameNode}</name-node>-->
-    <!-- <master>yarn-cluster</master>-->
-    <!-- <mode>cluster</mode>-->
-    <!-- <name>Update ${entity} set deleted by Inference</name>-->
-    <!-- <class>eu.dnetlib.dedup.SparkUpdateEntityJob</class>-->
-    <!-- <jar>dhp-dedup-${projectVersion}.jar</jar>-->
-    <!-- <spark-opts>-->
-    <!-- --executor-memory ${sparkExecutorMemory}-->
-    <!-- --driver-memory=${sparkDriverMemory}-->
-    <!-- ${sparkExtraOPT}-->
-    <!-- </spark-opts>-->
-    <!-- <arg>-mt</arg><arg>yarn-cluster</arg>-->
-    <!-- <arg>--entityPath</arg><arg>${targetPath}/${entity}/relation_propagated</arg>-->
-    <!-- <arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>-->
-    <!-- <arg>--entity</arg><arg>relation</arg>-->
-    <!-- <arg>--dedupRecordPath</arg><arg>${targetPath}/${entity}/dedup_records</arg>-->
-    <!-- <arg>--targetPath</arg><arg>${targetPath}/${entity}/updated_relation</arg>-->
-    <!-- </spark>-->
-    <!-- <ok to="End"/>-->
-    <!-- <error to="Kill"/>-->
-    <!-- </action>-->
-
-
     <action name="replaceEntity">
         <fs>
             <delete path='${sourcePath}/${entity}'/>
@@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph.sx;
+package eu.dnetlib.dhp.sx.graph;
 
 import com.mongodb.DBObject;
 import com.mongodb.MongoClient;

@@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph.sx;
+package eu.dnetlib.dhp.sx.graph;
 
 import com.jayway.jsonpath.JsonPath;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;

@@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph.sx;
+package eu.dnetlib.dhp.sx.graph;
 
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.utils.DHPUtils;

@@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph.sx;
+package eu.dnetlib.dhp.sx.graph;
 
 import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
@@ -1,9 +1,9 @@
-package eu.dnetlib.dhp.graph.sx;
+package eu.dnetlib.dhp.sx.graph;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.graph.sx.parser.DatasetScholexplorerParser;
+import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser;
-import eu.dnetlib.dhp.graph.sx.parser.PublicationScholexplorerParser;
+import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.scholexplorer.relation.RelationMapper;
 import org.apache.commons.io.IOUtils;
@@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph.sx.parser;
+package eu.dnetlib.dhp.sx.graph.parser;
 
 
 import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;

@@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph.sx.parser;
+package eu.dnetlib.dhp.sx.graph.parser;
 
 import com.ximpleware.AutoPilot;
 import com.ximpleware.VTDGen;

@@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph.sx.parser;
+package eu.dnetlib.dhp.sx.graph.parser;
 
 import com.ximpleware.AutoPilot;
 import com.ximpleware.VTDGen;
@@ -55,7 +55,7 @@
         <java>
             <job-tracker>${jobTracker}</job-tracker>
             <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.dhp.graph.sx.ImportDataFromMongo</main-class>
+            <main-class>eu.dnetlib.dhp.sx.graph.ImportDataFromMongo</main-class>
             <arg>-t</arg><arg>${targetPath}</arg>
             <arg>-n</arg><arg>${nameNode}</arg>
             <arg>-u</arg><arg>${user}</arg>

@@ -54,7 +54,7 @@
             <master>yarn-cluster</master>
             <mode>cluster</mode>
             <name>Extract ${entities}</name>
-            <class>eu.dnetlib.dhp.graph.sx.SparkExtractEntitiesJob</class>
+            <class>eu.dnetlib.dhp.sx.graph.SparkExtractEntitiesJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
                 --executor-memory ${sparkExecutorMemory}

@@ -45,7 +45,7 @@
             <master>yarn-cluster</master>
             <mode>cluster</mode>
             <name>Merge ${entity}</name>
-            <class>eu.dnetlib.dhp.graph.sx.SparkScholexplorerCreateRawGraphJob</class>
+            <class>eu.dnetlib.dhp.sx.graph.SparkScholexplorerCreateRawGraphJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts> --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
             <arg>-mt</arg> <arg>yarn-cluster</arg>
@@ -1,9 +1,9 @@
-package eu.dnetlib.dhp.graph.sx;
+package eu.dnetlib.dhp.sx.graph;
 
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.SerializationFeature;
-import eu.dnetlib.dhp.graph.sx.parser.DatasetScholexplorerParser;
+import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.scholexplorer.relation.RelationMapper;
 import org.apache.commons.io.IOUtils;

@@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph.sx;
+package eu.dnetlib.dhp.sx.graph;
 
 
 

@@ -1,4 +1,4 @@
-package eu.dnetlib.dhp.graph.sx;
+package eu.dnetlib.dhp.sx.graph;
 
 
 