dnet-hadoop/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java

150 lines
4.7 KiB
Java
Raw Normal View History

2023-06-26 13:58:11 +02:00
package eu.dnetlib.pace.util;
2023-06-26 13:58:11 +02:00
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType;
import org.apache.spark.sql.types.StructType;
2023-06-26 13:58:11 +02:00
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
import eu.dnetlib.pace.tree.support.TreeProcessor;
public class BlockProcessor {
2023-06-26 13:58:11 +02:00
public static final List<String> accumulators = new ArrayList<>();
private static final Log log = LogFactory.getLog(BlockProcessor.class);
private DedupConfig dedupConf;
private final int identifierFieldPos;
private final int orderFieldPos;
public static void constructAccumulator(final DedupConfig dedupConf) {
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
accumulators
.add(
String
.format(
"%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
accumulators
.add(
String
.format(
"%s::%s", dedupConf.getWf().getEntityType(),
String
.format(
"Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(),
dedupConf.getWf().getGroupMaxSize())));
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"));
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
accumulators
.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
}
public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) {
this.dedupConf = dedupConf;
this.identifierFieldPos = identifierFieldPos;
this.orderFieldPos = orderFieldPos;
}
public void processSortedRows(final List<Row> documents, final Reporter context) {
2023-06-26 13:58:11 +02:00
if (documents.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size());
2023-06-26 13:58:11 +02:00
processRows(documents, context);
2023-06-26 13:58:11 +02:00
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
}
}
private void processRows(final List<Row> queue, final Reporter context) {
for (int pivotPos = 0; pivotPos < queue.size(); pivotPos++) {
final Row pivot = queue.get(pivotPos);
2023-06-26 13:58:11 +02:00
final String idPivot = pivot.getString(identifierFieldPos); // identifier
final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString();
final WfConfig wf = dedupConf.getWf();
2023-06-26 13:58:11 +02:00
if (fieldPivot != null) {
int i = 0;
for (int windowPos = pivotPos + 1; windowPos < queue.size(); windowPos++) {
final Row curr = queue.get(windowPos);
2023-06-26 13:58:11 +02:00
final String idCurr = curr.getString(identifierFieldPos); // identifier
2023-06-26 13:58:11 +02:00
if (mustSkip(idCurr)) {
context.incrementCounter(wf.getEntityType(), "skip list", 1);
break;
}
2023-07-04 18:36:58 +02:00
if (++i > wf.getSlidingWindowSize()) {
2023-06-26 13:58:11 +02:00
break;
}
2023-06-26 13:58:11 +02:00
final Object fieldsCurr = getJavaValue(curr, orderFieldPos);
final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString();
2023-06-26 13:58:11 +02:00
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
2023-06-26 13:58:11 +02:00
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
2023-06-26 13:58:11 +02:00
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
}
}
}
}
}
2023-06-26 13:58:11 +02:00
public Object getJavaValue(Row row, int pos) {
DataType dt = row.schema().fields()[pos].dataType();
if (dt instanceof StringType) {
return row.getString(pos);
} else if (dt instanceof ArrayType) {
return row.getList(pos);
}
2023-06-26 13:58:11 +02:00
return null;
}
2023-06-26 13:58:11 +02:00
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
2023-06-26 13:58:11 +02:00
if (result) {
if (idPivot.compareTo(idCurr) <= 0) {
writeSimilarity(context, idPivot, idCurr);
} else {
writeSimilarity(context, idCurr, idPivot);
}
2023-06-26 13:58:11 +02:00
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
}
}
2023-06-26 13:58:11 +02:00
private boolean mustSkip(final String idPivot) {
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
}
2023-06-26 13:58:11 +02:00
private String getNsPrefix(final String id) {
return StringUtils.substringBetween(id, "|", "::");
}
2023-06-26 13:58:11 +02:00
private void writeSimilarity(final Reporter context, final String from, final String to) {
final String type = dedupConf.getWf().getEntityType();
2023-06-26 13:58:11 +02:00
context.emit(type, from, to);
}
}