forked from D-Net/dnet-hadoop
fix for issue that duplicated result
This commit is contained in:
parent
8f6ce970f9
commit
b258f99ece
|
@ -77,9 +77,15 @@ public class PrepareDatasourceCountryAssociation {
|
|||
List<String> allowedtypes,
|
||||
String inputPath,
|
||||
String outputPath) {
|
||||
String whitelisted = "";
|
||||
for (String i : whitelist) {
|
||||
whitelisted += " OR id = '" + i + "'";
|
||||
String whitelisted = " d.id = '" + whitelist.get(0) + "'";
|
||||
for (int i = 1; i < whitelist.size(); i++) {
|
||||
whitelisted += " OR d.id = '" + whitelist.get(i) + "'";
|
||||
}
|
||||
|
||||
String allowed = "d.datasourcetype.classid = '" + allowedtypes.get(0) + "'";
|
||||
|
||||
for (int i = 1; i < allowedtypes.size(); i++) {
|
||||
allowed += " OR d.datasourcetype.classid = '" + allowedtypes.get(i) + "'";
|
||||
}
|
||||
|
||||
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
|
||||
|
@ -90,26 +96,39 @@ public class PrepareDatasourceCountryAssociation {
|
|||
relation.createOrReplaceTempView("relation");
|
||||
organization.createOrReplaceTempView("organization");
|
||||
|
||||
String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
|
||||
+ "FROM ( SELECT id "
|
||||
+ " FROM datasource "
|
||||
+ " WHERE (datainfo.deletedbyinference = false "
|
||||
+ whitelisted
|
||||
+ ") "
|
||||
+ getConstraintList("datasourcetype.classid = '", allowedtypes)
|
||||
+ ") d "
|
||||
+ "JOIN ( SELECT source, target "
|
||||
+ " FROM relation "
|
||||
+ " WHERE relclass = '"
|
||||
+ ModelConstants.IS_PROVIDED_BY
|
||||
+ "' "
|
||||
+ " AND datainfo.deletedbyinference = false ) rel "
|
||||
+ "ON d.id = rel.source "
|
||||
+ "JOIN (SELECT id, country "
|
||||
+ " FROM organization "
|
||||
+ " WHERE datainfo.deletedbyinference = false "
|
||||
+ " AND length(country.classid) > 0) o "
|
||||
+ "ON o.id = rel.target";
|
||||
// String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country "
|
||||
// + "FROM ( SELECT id "
|
||||
// + " FROM datasource "
|
||||
// + " WHERE (datainfo.deletedbyinference = false "
|
||||
// + whitelisted
|
||||
// + ") "
|
||||
// + getConstraintList("datasourcetype.classid = '", allowedtypes)
|
||||
// + ") d "
|
||||
// + "JOIN ( SELECT source, target "
|
||||
// + " FROM relation "
|
||||
// + " WHERE relclass = '"
|
||||
// + ModelConstants.IS_PROVIDED_BY
|
||||
// + "' "
|
||||
// + " AND datainfo.deletedbyinference = false ) rel "
|
||||
// + "ON d.id = rel.source "
|
||||
// + "JOIN (SELECT id, country "
|
||||
// + " FROM organization "
|
||||
// + " WHERE datainfo.deletedbyinference = false "
|
||||
// + " AND length(country.classid) > 0) o "
|
||||
// + "ON o.id = rel.target";
|
||||
|
||||
String query = "SELECT source dataSourceId, " +
|
||||
"named_struct('classid', country.classid, 'classname', country.classname) country " +
|
||||
"FROM datasource d " +
|
||||
"JOIN relation rel " +
|
||||
"ON d.id = rel.source " +
|
||||
"JOIN organization o " +
|
||||
"ON o.id = rel.target " +
|
||||
"WHERE rel.datainfo.deletedbyinference = false " +
|
||||
"and rel.relclass = '" + ModelConstants.IS_PROVIDED_BY + "'" +
|
||||
"and o.datainfo.deletedbyinference = false " +
|
||||
"and length(o.country.classid) > 0 " +
|
||||
"and (" + allowed + " or " + whitelisted + ")";
|
||||
|
||||
spark
|
||||
.sql(query)
|
||||
|
|
|
@ -4,7 +4,12 @@ package eu.dnetlib.dhp.countrypropagation;
|
|||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -13,6 +18,7 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class PrepareResultCountrySet {
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareResultCountrySet.class);
|
||||
|
@ -60,6 +66,7 @@ public class PrepareResultCountrySet {
|
|||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
getPotentialResultToUpdate(
|
||||
spark,
|
||||
inputPath,
|
||||
|
@ -89,10 +96,33 @@ public class PrepareResultCountrySet {
|
|||
spark
|
||||
.sql(RESULT_COUNTRYSET_QUERY)
|
||||
.as(Encoders.bean(ResultCountrySet.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Append)
|
||||
.json(outputPath);
|
||||
.toJavaRDD()
|
||||
.mapToPair(value -> new Tuple2<>(value.getResultId(), value))
|
||||
.reduceByKey((a, b) -> {
|
||||
ArrayList<CountrySbs> countryList = a.getCountrySet();
|
||||
Set<String> countryCodes = countryList
|
||||
.stream()
|
||||
.map(country -> country.getClassid())
|
||||
.collect(Collectors.toSet());
|
||||
b
|
||||
.getCountrySet()
|
||||
.stream()
|
||||
.forEach(c -> {
|
||||
if (!countryCodes.contains(c.getClassid())) {
|
||||
countryList.add(c);
|
||||
countryCodes.add(c.getClassid());
|
||||
}
|
||||
|
||||
});
|
||||
a.setCountrySet(countryList);
|
||||
return a;
|
||||
})
|
||||
.map(couple -> OBJECT_MAPPER.writeValueAsString(couple._2()))
|
||||
.saveAsTextFile(outputPath, GzipCodec.class);
|
||||
// .write()
|
||||
// .option("compression", "gzip")
|
||||
// .mode(SaveMode.Append)
|
||||
// .json(outputPath);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue