forked from antonis.lempesis/dnet-hadoop
WIP promote job functions updated
This commit is contained in:
parent
8d9b3c5de2
commit
d0c9b0cdd6
|
@ -3,7 +3,6 @@ package eu.dnetlib.dhp.actionmanager;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.ReduceFunction;
|
import org.apache.spark.api.java.function.ReduceFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -21,20 +20,12 @@ import java.util.function.Function;
|
||||||
public class PromoteActionSetFromHDFSFunctions {
|
public class PromoteActionSetFromHDFSFunctions {
|
||||||
|
|
||||||
public static <T extends Oaf> Dataset<T> joinOafEntityWithActionPayloadAndMerge(Dataset<T> oafDS,
|
public static <T extends Oaf> Dataset<T> joinOafEntityWithActionPayloadAndMerge(Dataset<T> oafDS,
|
||||||
Dataset<String> actionPayloadDS,
|
Dataset<T> actionPayloadOafDS,
|
||||||
SerializableSupplier<Function<T, String>> oafIdFn,
|
SerializableSupplier<Function<T, String>> oafIdFn,
|
||||||
SerializableSupplier<BiFunction<String, Class<T>, T>> actionPayloadToOafFn,
|
|
||||||
SerializableSupplier<BiFunction<T, T, T>> mergeAndGetFn,
|
SerializableSupplier<BiFunction<T, T, T>> mergeAndGetFn,
|
||||||
Class<T> clazz) {
|
Class<T> clazz) {
|
||||||
Dataset<Tuple2<String, T>> oafWithIdDS = oafDS
|
Dataset<Tuple2<String, T>> oafWithIdDS = mapToTupleWithId(oafDS, oafIdFn, clazz);
|
||||||
.map((MapFunction<T, Tuple2<String, T>>) value -> new Tuple2<>(oafIdFn.get().apply(value), value),
|
Dataset<Tuple2<String, T>> actionPayloadWithIdDS = mapToTupleWithId(actionPayloadOafDS, oafIdFn, clazz);
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
|
||||||
|
|
||||||
Dataset<Tuple2<String, T>> actionPayloadWithIdDS = actionPayloadDS
|
|
||||||
.map((MapFunction<String, T>) value -> actionPayloadToOafFn.get().apply(value, clazz), Encoders.kryo(clazz))
|
|
||||||
.filter((FilterFunction<T>) Objects::nonNull)
|
|
||||||
.map((MapFunction<T, Tuple2<String, T>>) value -> new Tuple2<>(oafIdFn.get().apply(value), value),
|
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
|
||||||
|
|
||||||
return oafWithIdDS
|
return oafWithIdDS
|
||||||
.joinWith(actionPayloadWithIdDS, oafWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")), "left_outer")
|
.joinWith(actionPayloadWithIdDS, oafWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")), "left_outer")
|
||||||
|
@ -48,6 +39,14 @@ public class PromoteActionSetFromHDFSFunctions {
|
||||||
}, Encoders.kryo(clazz));
|
}, Encoders.kryo(clazz));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static <T extends Oaf> Dataset<Tuple2<String, T>> mapToTupleWithId(Dataset<T> ds,
|
||||||
|
SerializableSupplier<Function<T, String>> idFn,
|
||||||
|
Class<T> clazz) {
|
||||||
|
return ds
|
||||||
|
.map((MapFunction<T, Tuple2<String, T>>) value -> new Tuple2<>(idFn.get().apply(value), value),
|
||||||
|
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
||||||
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> Dataset<T> groupOafByIdAndMerge(Dataset<T> oafDS,
|
public static <T extends Oaf> Dataset<T> groupOafByIdAndMerge(Dataset<T> oafDS,
|
||||||
SerializableSupplier<Function<T, String>> oafIdFn,
|
SerializableSupplier<Function<T, String>> oafIdFn,
|
||||||
SerializableSupplier<BiFunction<T, T, T>> mergeAndGetFn,
|
SerializableSupplier<BiFunction<T, T, T>> mergeAndGetFn,
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
package eu.dnetlib.dhp.actionmanager;
|
package eu.dnetlib.dhp.actionmanager;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.JsonNode;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -11,7 +9,6 @@ import org.junit.AfterClass;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.function.BiFunction;
|
import java.util.function.BiFunction;
|
||||||
|
@ -50,38 +47,24 @@ public class PromoteActionSetFromHDFSFunctionsTest {
|
||||||
);
|
);
|
||||||
Dataset<OafImpl> oafDS = spark.createDataset(oafData, Encoders.bean(OafImpl.class));
|
Dataset<OafImpl> oafDS = spark.createDataset(oafData, Encoders.bean(OafImpl.class));
|
||||||
|
|
||||||
List<String> actionPayloadData = Arrays.asList(
|
List<OafImpl> actionPayloadData = Arrays.asList(
|
||||||
createActionPayload(id1),
|
createOafImpl(id1),
|
||||||
createActionPayload(id2), createActionPayload(id2),
|
createOafImpl(id2), createOafImpl(id2),
|
||||||
createActionPayload(id3), createActionPayload(id3), createActionPayload(id3)
|
createOafImpl(id3), createOafImpl(id3), createOafImpl(id3)
|
||||||
);
|
);
|
||||||
Dataset<String> actionPayloadDS = spark.createDataset(actionPayloadData, Encoders.STRING());
|
Dataset<OafImpl> actionPayloadDS = spark.createDataset(actionPayloadData, Encoders.bean(OafImpl.class));
|
||||||
|
|
||||||
SerializableSupplier<Function<OafImpl, String>> oafIdFn = () -> OafImpl::getId;
|
SerializableSupplier<Function<OafImpl, String>> oafIdFn = () -> OafImpl::getId;
|
||||||
SerializableSupplier<BiFunction<String, Class<OafImpl>, OafImpl>> actionPayloadToOafFn = () -> (s, clazz) -> {
|
SerializableSupplier<BiFunction<OafImpl, OafImpl, OafImpl>> mergeAndGetFn = () -> OafImpl::mergeAngGet;
|
||||||
try {
|
|
||||||
JsonNode jsonNode = new ObjectMapper().readTree(s);
|
|
||||||
String id = jsonNode.at("/id").asText();
|
|
||||||
return createOafImpl(id);
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
SerializableSupplier<BiFunction<OafImpl, OafImpl, OafImpl>> mergeAndGetFn = () -> (x, y) -> {
|
|
||||||
x.mergeFrom(y);
|
|
||||||
return x;
|
|
||||||
};
|
|
||||||
|
|
||||||
// when
|
// when
|
||||||
List<OafImpl> results = PromoteActionSetFromHDFSFunctions
|
List<OafImpl> results = PromoteActionSetFromHDFSFunctions
|
||||||
.joinOafEntityWithActionPayloadAndMerge(oafDS,
|
.joinOafEntityWithActionPayloadAndMerge(oafDS,
|
||||||
actionPayloadDS,
|
actionPayloadDS,
|
||||||
oafIdFn,
|
oafIdFn,
|
||||||
actionPayloadToOafFn,
|
|
||||||
mergeAndGetFn,
|
mergeAndGetFn,
|
||||||
OafImpl.class)
|
OafImpl.class)
|
||||||
.collectAsList();
|
.collectAsList();
|
||||||
// System.out.println(results.stream().map(x -> String.format("%s:%d", x.getId(), x.merged)).collect(Collectors.joining(",")));
|
|
||||||
|
|
||||||
// then
|
// then
|
||||||
assertEquals(7, results.size());
|
assertEquals(7, results.size());
|
||||||
|
@ -95,6 +78,8 @@ public class PromoteActionSetFromHDFSFunctionsTest {
|
||||||
case "id4":
|
case "id4":
|
||||||
assertEquals(1, result.merged);
|
assertEquals(1, result.merged);
|
||||||
break;
|
break;
|
||||||
|
default:
|
||||||
|
throw new RuntimeException();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -112,10 +97,7 @@ public class PromoteActionSetFromHDFSFunctionsTest {
|
||||||
);
|
);
|
||||||
Dataset<OafImpl> oafDS = spark.createDataset(oafData, Encoders.bean(OafImpl.class));
|
Dataset<OafImpl> oafDS = spark.createDataset(oafData, Encoders.bean(OafImpl.class));
|
||||||
SerializableSupplier<Function<OafImpl, String>> idFn = () -> OafImpl::getId;
|
SerializableSupplier<Function<OafImpl, String>> idFn = () -> OafImpl::getId;
|
||||||
SerializableSupplier<BiFunction<OafImpl, OafImpl, OafImpl>> mergeAndGetFn = () -> (x, y) -> {
|
SerializableSupplier<BiFunction<OafImpl, OafImpl, OafImpl>> mergeAndGetFn = () -> OafImpl::mergeAngGet;
|
||||||
x.mergeFrom(y);
|
|
||||||
return x;
|
|
||||||
};
|
|
||||||
|
|
||||||
// when
|
// when
|
||||||
List<OafImpl> results = PromoteActionSetFromHDFSFunctions
|
List<OafImpl> results = PromoteActionSetFromHDFSFunctions
|
||||||
|
@ -124,7 +106,6 @@ public class PromoteActionSetFromHDFSFunctionsTest {
|
||||||
mergeAndGetFn,
|
mergeAndGetFn,
|
||||||
OafImpl.class)
|
OafImpl.class)
|
||||||
.collectAsList();
|
.collectAsList();
|
||||||
// System.out.println(results.stream().map(x -> String.format("%s:%d", x.getId(), x.merged)).collect(Collectors.joining(",")));
|
|
||||||
|
|
||||||
// then
|
// then
|
||||||
assertEquals(3, results.size());
|
assertEquals(3, results.size());
|
||||||
|
@ -139,6 +120,8 @@ public class PromoteActionSetFromHDFSFunctionsTest {
|
||||||
case "id3":
|
case "id3":
|
||||||
assertEquals(3, result.merged);
|
assertEquals(3, result.merged);
|
||||||
break;
|
break;
|
||||||
|
default:
|
||||||
|
throw new RuntimeException();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -147,8 +130,9 @@ public class PromoteActionSetFromHDFSFunctionsTest {
|
||||||
private String id;
|
private String id;
|
||||||
private int merged = 1;
|
private int merged = 1;
|
||||||
|
|
||||||
public void mergeFrom(Oaf e) {
|
public OafImpl mergeAngGet(OafImpl e) {
|
||||||
merged += ((OafImpl) e).merged;
|
merged += e.merged;
|
||||||
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getId() {
|
public String getId() {
|
||||||
|
@ -174,7 +158,4 @@ public class PromoteActionSetFromHDFSFunctionsTest {
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String createActionPayload(String id) {
|
|
||||||
return String.format("{\"id\":\"%s\"}", id);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue