[graph cleaning] also patch the datasource names in the result's collectedfrom and hostedby fields, according to the datasource master-duplicate mapping

This commit is contained in:
Claudio Atzori 2022-11-28 10:18:43 +01:00
parent 6082d235d3
commit 11695ba649
4 changed files with 55 additions and 29 deletions

View File

@ -8,7 +8,6 @@ import java.nio.charset.StandardCharsets;
import java.sql.ResultSet; import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
@ -28,7 +27,9 @@ public class ReadDatasourceMasterDuplicateFromDB {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String QUERY = "SELECT id as master, duplicate FROM dsm_dedup_services;"; private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId "
+
"FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);";
public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode) public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
throws IOException { throws IOException {
@ -52,11 +53,15 @@ public class ReadDatasourceMasterDuplicateFromDB {
private static MasterDuplicate datasourceMasterMap(ResultSet rs) { private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
try { try {
MasterDuplicate md = new MasterDuplicate(); final MasterDuplicate md = new MasterDuplicate();
final String master = rs.getString("master");
final String duplicate = rs.getString("duplicate"); final String duplicateId = rs.getString("duplicateId");
md.setMaster(OafMapperUtils.createOpenaireId(10, master, true)); final String masterId = rs.getString("masterId");
md.setDuplicate(OafMapperUtils.createOpenaireId(10, duplicate, true)); final String masterName = rs.getString("masterName");
md.setDuplicateId(OafMapperUtils.createOpenaireId(10, duplicateId, true));
md.setMasterId(OafMapperUtils.createOpenaireId(10, masterId, true));
md.setMasterName(masterName);
return md; return md;
} catch (final SQLException e) { } catch (final SQLException e) {

View File

@ -8,22 +8,31 @@ import java.io.Serializable;
* @Date 21/07/22 * @Date 21/07/22
*/ */
public class MasterDuplicate implements Serializable {

	// Identifier of the master datasource.
	private String masterId;

	// Name of the master datasource.
	private String masterName;

	// Identifier of the datasource recorded as a duplicate of the master.
	private String duplicateId;

	/** @return the identifier of the master datasource */
	public String getMasterId() {
		return this.masterId;
	}

	/** @param masterId the identifier of the master datasource */
	public void setMasterId(final String masterId) {
		this.masterId = masterId;
	}

	/** @return the name of the master datasource */
	public String getMasterName() {
		return this.masterName;
	}

	/** @param masterName the name of the master datasource */
	public void setMasterName(final String masterName) {
		this.masterName = masterName;
	}

	/** @return the identifier of the duplicate datasource */
	public String getDuplicateId() {
		return this.duplicateId;
	}

	/** @param duplicateId the identifier of the duplicate datasource */
	public void setDuplicateId(final String duplicateId) {
		this.duplicateId = duplicateId;
	}
}

View File

@ -78,6 +78,7 @@ public class CleanCfHbSparkJob {
private static <T extends Result> void cleanCfHb(SparkSession spark, String inputPath, Class<T> entityClazz, private static <T extends Result> void cleanCfHb(SparkSession spark, String inputPath, Class<T> entityClazz,
String workingPath, String masterDuplicatePath, String outputPath) { String workingPath, String masterDuplicatePath, String outputPath) {
// read the master-duplicate tuples // read the master-duplicate tuples
Dataset<MasterDuplicate> md = spark Dataset<MasterDuplicate> md = spark
.read() .read()
@ -111,7 +112,7 @@ public class CleanCfHbSparkJob {
resolved resolved
.joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicate"))) .joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicate")))
.map((MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping>) t -> { .map((MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping>) t -> {
t._1().setMaster(t._2().getMaster()); t._1().setMasterId(t._2().getMasterId());
return t._1(); return t._1();
}, Encoders.bean(IdCfHbMapping.class)) }, Encoders.bean(IdCfHbMapping.class))
.write() .write()
@ -154,13 +155,13 @@ public class CleanCfHbSparkJob {
@Override @Override
public T reduce(T r, IdCfHbMapping a) { public T reduce(T r, IdCfHbMapping a) {
if (Objects.isNull(a) && StringUtils.isBlank(a.getMaster())) { if (Objects.isNull(a) && StringUtils.isBlank(a.getMasterId())) {
return r; return r;
} }
r.getCollectedfrom().forEach(kv -> updateKey(kv, a)); r.getCollectedfrom().forEach(kv -> updateKeyValue(kv, a));
r.getInstance().forEach(i -> { r.getInstance().forEach(i -> {
updateKey(i.getHostedby(), a); updateKeyValue(i.getHostedby(), a);
updateKey(i.getCollectedfrom(), a); updateKeyValue(i.getCollectedfrom(), a);
}); });
return r; return r;
} }
@ -178,9 +179,10 @@ public class CleanCfHbSparkJob {
return r; return r;
} }
private void updateKey(final KeyValue kv, final IdCfHbMapping a) { private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) {
if (kv.getKey().equals(a.getCfhb())) { if (kv.getKey().equals(a.getCfhb())) {
kv.setKey(a.getMaster()); kv.setKey(a.getMasterId());
kv.setValue(a.getMasterName());
} }
} }

View File

@ -9,7 +9,9 @@ public class IdCfHbMapping implements Serializable {
private String cfhb; private String cfhb;
private String master; private String masterId;
private String masterName;
public IdCfHbMapping() { public IdCfHbMapping() {
} }
@ -34,11 +36,19 @@ public class IdCfHbMapping implements Serializable {
this.cfhb = cfhb; this.cfhb = cfhb;
} }
public String getMaster() { public String getMasterId() {
return master; return masterId;
} }
public void setMaster(String master) { public void setMasterId(String masterId) {
this.master = master; this.masterId = masterId;
}
public String getMasterName() {
return masterName;
}
public void setMasterName(String masterName) {
this.masterName = masterName;
} }
} }