refactoring

This commit is contained in:
Miriam Baglioni 2023-06-30 19:05:15 +02:00
parent 55ea485783
commit 4c9bc4c3a5
3 changed files with 520 additions and 482 deletions

View File

@ -82,14 +82,16 @@ public class SparkAtomicActionUsageJob implements Serializable {
}); });
} }
private static void prepareResultData(String dbname, SparkSession spark, String workingPath, String tableName, String resultAttributeName, String datasourceAttributeName) { private static void prepareResultData(String dbname, SparkSession spark, String workingPath, String tableName,
String resultAttributeName, String datasourceAttributeName) {
spark spark
.sql( .sql(
String String
.format( .format(
"select %s as id, %s as datasourceId, sum(downloads) as downloads, sum(views) as views " + "select %s as id, %s as datasourceId, sum(downloads) as downloads, sum(views) as views " +
"from %s.%s group by %s, %s", "from %s.%s group by %s, %s",
resultAttributeName, datasourceAttributeName, dbname, tableName, resultAttributeName, datasourceAttributeName)) resultAttributeName, datasourceAttributeName, dbname, tableName, resultAttributeName,
datasourceAttributeName))
.as(Encoders.bean(UsageStatsResultModel.class)) .as(Encoders.bean(UsageStatsResultModel.class))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
@ -131,20 +133,22 @@ public class SparkAtomicActionUsageJob implements Serializable {
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
} }
public static Measure newMeasureInstance(String id) { public static Measure newMeasureInstance(String id) {
Measure m = new Measure(); Measure m = new Measure();
m.setId(id); m.setId(id);
m.setUnit(new ArrayList<>()); m.setUnit(new ArrayList<>());
return m; return m;
} }
private static Dataset<Result> getFinalIndicatorsResult(SparkSession spark, String inputPath) { private static Dataset<Result> getFinalIndicatorsResult(SparkSession spark, String inputPath) {
return readPath(spark, inputPath, UsageStatsResultModel.class) return readPath(spark, inputPath, UsageStatsResultModel.class)
.groupByKey((MapFunction<UsageStatsResultModel, String>) usm -> usm.getId(), Encoders.STRING()) .groupByKey((MapFunction<UsageStatsResultModel, String>) usm -> usm.getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, UsageStatsResultModel, Result>) (k,it) -> { .mapGroups((MapGroupsFunction<String, UsageStatsResultModel, Result>) (k, it) -> {
Result r = new Result(); Result r = new Result();
r.setId("50|" + k); r.setId("50|" + k);
//id = download or view and unit = list of key value pairs // id = download or view and unit = list of key value pairs
Measure download = newMeasureInstance("downloads"); Measure download = newMeasureInstance("downloads");
Measure view = newMeasureInstance("views"); Measure view = newMeasureInstance("views");
UsageStatsResultModel first = it.next(); UsageStatsResultModel first = it.next();
@ -178,8 +182,14 @@ public class SparkAtomicActionUsageJob implements Serializable {
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS), ModelConstants.DNET_PROVENANCE_ACTIONS),
""); "");
download.getUnit().add(OafMapperUtils.newKeyValueInstance(usm.getDatasourceId(), String.valueOf(usm.getDownloads()), dataInfo)); download
view.getUnit().add(OafMapperUtils.newKeyValueInstance(usm.getDatasourceId(), String.valueOf(usm.getViews()), dataInfo)); .getUnit()
.add(
OafMapperUtils
.newKeyValueInstance(usm.getDatasourceId(), String.valueOf(usm.getDownloads()), dataInfo));
view
.getUnit()
.add(OafMapperUtils.newKeyValueInstance(usm.getDatasourceId(), String.valueOf(usm.getViews()), dataInfo));
} }
private static Dataset<Project> getFinalIndicatorsProject(SparkSession spark, String inputPath) { private static Dataset<Project> getFinalIndicatorsProject(SparkSession spark, String inputPath) {

View File

@ -1,11 +1,12 @@
package eu.dnetlib.dhp.actionmanager.usagestats; package eu.dnetlib.dhp.actionmanager.usagestats;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
* @Date 30/06/23 * @Date 30/06/23
*/ */
public class UsageStatsResultModel extends UsageStatsModel{ public class UsageStatsResultModel extends UsageStatsModel {
private String datasourceId ; private String datasourceId;
public String getDatasourceId() { public String getDatasourceId() {
return datasourceId; return datasourceId;

View File

@ -8,7 +8,6 @@ import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.oaf.Measure;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -25,6 +24,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
@ -144,7 +144,8 @@ public class SparkAtomicActionCountJobTest {
"Inferred by OpenAIRE", "Inferred by OpenAIRE",
u.getDataInfo().getProvenanceaction().getClassname())))); u.getDataInfo().getProvenanceaction().getClassname()))));
tmp.filter(aa -> ((OafEntity) aa.getPayload()).getId().startsWith("40|")) tmp
.filter(aa -> ((OafEntity) aa.getPayload()).getId().startsWith("40|"))
.foreach( .foreach(
r -> ((OafEntity) r.getPayload()) r -> ((OafEntity) r.getPayload())
.getMeasures() .getMeasures()
@ -169,37 +170,60 @@ public class SparkAtomicActionCountJobTest {
.equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6")) .equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6"))
.count()); .count());
OafEntity entity = (OafEntity) tmp.filter(aa -> ((OafEntity) aa.getPayload()).getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6")).first() OafEntity entity = (OafEntity) tmp
.filter(
aa -> ((OafEntity) aa.getPayload()).getId().equals("50|dedup_wf_001::53575dc69e9ace947e02d47ecd54a7a6"))
.first()
.getPayload(); .getPayload();
entity entity
.getMeasures() .getMeasures()
.stream() .stream()
.forEach( .forEach(
m -> Assertions.assertEquals(3, m.getUnit().size() )); m -> Assertions.assertEquals(3, m.getUnit().size()));
Measure downloads = entity.getMeasures() Measure downloads = entity
.getMeasures()
.stream() .stream()
.filter(m -> m.getId().equals("downloads")) .filter(m -> m.getId().equals("downloads"))
.findFirst() .findFirst()
.get(); .get();
Assertions
.assertEquals(
String.valueOf(0),
downloads.getUnit().stream().filter(u -> u.getKey().equals("10|fake1")).findFirst().get().getValue());
Assertions
.assertEquals(
String.valueOf(0),
downloads.getUnit().stream().filter(u -> u.getKey().equals("10|fake2")).findFirst().get().getValue());
Assertions
.assertEquals(
String.valueOf(1),
downloads.getUnit().stream().filter(u -> u.getKey().equals("10|fake3")).findFirst().get().getValue());
Assertions.assertEquals(String.valueOf(0), downloads.getUnit().stream().filter(u -> u.getKey().equals("10|fake1")).findFirst().get().getValue()); Measure views = entity
Assertions.assertEquals(String.valueOf(0), downloads.getUnit().stream().filter(u -> u.getKey().equals("10|fake2")).findFirst().get().getValue()); .getMeasures()
Assertions.assertEquals(String.valueOf(1), downloads.getUnit().stream().filter(u -> u.getKey().equals("10|fake3")).findFirst().get().getValue());
Measure views = entity.getMeasures()
.stream() .stream()
.filter(m -> m.getId().equals("views")) .filter(m -> m.getId().equals("views"))
.findFirst() .findFirst()
.get(); .get();
Assertions.assertEquals(String.valueOf(5), views.getUnit().stream().filter(u -> u.getKey().equals("10|fake1")).findFirst().get().getValue()); Assertions
Assertions.assertEquals(String.valueOf(1), views.getUnit().stream().filter(u -> u.getKey().equals("10|fake2")).findFirst().get().getValue()); .assertEquals(
Assertions.assertEquals(String.valueOf(3), views.getUnit().stream().filter(u -> u.getKey().equals("10|fake3")).findFirst().get().getValue()); String.valueOf(5),
views.getUnit().stream().filter(u -> u.getKey().equals("10|fake1")).findFirst().get().getValue());
Assertions
.assertEquals(
String.valueOf(1),
views.getUnit().stream().filter(u -> u.getKey().equals("10|fake2")).findFirst().get().getValue());
Assertions
.assertEquals(
String.valueOf(3),
views.getUnit().stream().filter(u -> u.getKey().equals("10|fake3")).findFirst().get().getValue());
tmp.filter(aa -> ((OafEntity) aa.getPayload()).getId().startsWith("10|")) tmp
.filter(aa -> ((OafEntity) aa.getPayload()).getId().startsWith("10|"))
.foreach( .foreach(
r -> ((OafEntity) r.getPayload()) r -> ((OafEntity) r.getPayload())
.getMeasures() .getMeasures()
@ -214,7 +238,6 @@ public class SparkAtomicActionCountJobTest {
"count", "count",
u.getKey())))); u.getKey()))));
Assertions Assertions
.assertEquals( .assertEquals(
"0", "0",
@ -413,6 +436,7 @@ public class SparkAtomicActionCountJobTest {
.get(0) .get(0)
.getValue()); .getValue());
} }
@Test @Test
void testMatch() { void testMatch() {
String usageScoresPath = getClass() String usageScoresPath = getClass()
@ -490,7 +514,8 @@ public class SparkAtomicActionCountJobTest {
"Inferred by OpenAIRE", "Inferred by OpenAIRE",
u.getDataInfo().getProvenanceaction().getClassname())))); u.getDataInfo().getProvenanceaction().getClassname()))));
tmp.filter(aa -> ((OafEntity) aa.getPayload()).getId().startsWith("40|")) tmp
.filter(aa -> ((OafEntity) aa.getPayload()).getId().startsWith("40|"))
.foreach( .foreach(
r -> ((OafEntity) r.getPayload()) r -> ((OafEntity) r.getPayload())
.getMeasures() .getMeasures()
@ -505,7 +530,8 @@ public class SparkAtomicActionCountJobTest {
"count", "count",
u.getKey())))); u.getKey()))));
tmp.filter(aa -> ((OafEntity) aa.getPayload()).getId().startsWith("50|")) tmp
.filter(aa -> ((OafEntity) aa.getPayload()).getId().startsWith("50|"))
.foreach( .foreach(
r -> ((OafEntity) r.getPayload()) r -> ((OafEntity) r.getPayload())
.getMeasures() .getMeasures()
@ -520,7 +546,8 @@ public class SparkAtomicActionCountJobTest {
"10|fake1", "10|fake1",
u.getKey())))); u.getKey()))));
tmp.filter(aa -> ((OafEntity) aa.getPayload()).getId().startsWith("10|")) tmp
.filter(aa -> ((OafEntity) aa.getPayload()).getId().startsWith("10|"))
.foreach( .foreach(
r -> ((OafEntity) r.getPayload()) r -> ((OafEntity) r.getPayload())
.getMeasures() .getMeasures()