forked from D-Net/dnet-hadoop
[graph cleaning] also patch the datasource name in the result's collectedfrom and hostedby fields according to the datasource master-duplicate mapping
This commit is contained in:
parent
6082d235d3
commit
11695ba649
|
@ -8,7 +8,6 @@ import java.nio.charset.StandardCharsets;
|
||||||
import java.sql.ResultSet;
|
import java.sql.ResultSet;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
@ -28,7 +27,9 @@ public class ReadDatasourceMasterDuplicateFromDB {
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
private static final String QUERY = "SELECT id as master, duplicate FROM dsm_dedup_services;";
|
private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId "
|
||||||
|
+
|
||||||
|
"FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);";
|
||||||
|
|
||||||
public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
|
public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
@ -52,11 +53,15 @@ public class ReadDatasourceMasterDuplicateFromDB {
|
||||||
|
|
||||||
private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
|
private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
|
||||||
try {
|
try {
|
||||||
MasterDuplicate md = new MasterDuplicate();
|
final MasterDuplicate md = new MasterDuplicate();
|
||||||
final String master = rs.getString("master");
|
|
||||||
final String duplicate = rs.getString("duplicate");
|
final String duplicateId = rs.getString("duplicateId");
|
||||||
md.setMaster(OafMapperUtils.createOpenaireId(10, master, true));
|
final String masterId = rs.getString("masterId");
|
||||||
md.setDuplicate(OafMapperUtils.createOpenaireId(10, duplicate, true));
|
final String masterName = rs.getString("masterName");
|
||||||
|
|
||||||
|
md.setDuplicateId(OafMapperUtils.createOpenaireId(10, duplicateId, true));
|
||||||
|
md.setMasterId(OafMapperUtils.createOpenaireId(10, masterId, true));
|
||||||
|
md.setMasterName(masterName);
|
||||||
|
|
||||||
return md;
|
return md;
|
||||||
} catch (final SQLException e) {
|
} catch (final SQLException e) {
|
||||||
|
|
|
@ -8,22 +8,31 @@ import java.io.Serializable;
|
||||||
* @Date 21/07/22
|
* @Date 21/07/22
|
||||||
*/
|
*/
|
||||||
public class MasterDuplicate implements Serializable {

	// One entry of the datasource master-duplicate mapping: the identifier of a duplicate
	// datasource together with the identifier and the name of its master.

	/** Identifier of the duplicate datasource. */
	private String duplicateId;

	/** Identifier of the master datasource. */
	private String masterId;

	/** Name of the master datasource. */
	private String masterName;

	/**
	 * @return the identifier of the duplicate datasource
	 */
	public String getDuplicateId() {
		return duplicateId;
	}

	/**
	 * @param duplicateId the identifier of the duplicate datasource
	 */
	public void setDuplicateId(String duplicateId) {
		this.duplicateId = duplicateId;
	}

	/**
	 * @return the identifier of the master datasource
	 */
	public String getMasterId() {
		return masterId;
	}

	/**
	 * @param masterId the identifier of the master datasource
	 */
	public void setMasterId(String masterId) {
		this.masterId = masterId;
	}

	/**
	 * @return the name of the master datasource
	 */
	public String getMasterName() {
		return masterName;
	}

	/**
	 * @param masterName the name of the master datasource
	 */
	public void setMasterName(String masterName) {
		this.masterName = masterName;
	}
}
|
||||||
|
|
|
@ -78,6 +78,7 @@ public class CleanCfHbSparkJob {
|
||||||
|
|
||||||
private static <T extends Result> void cleanCfHb(SparkSession spark, String inputPath, Class<T> entityClazz,
|
private static <T extends Result> void cleanCfHb(SparkSession spark, String inputPath, Class<T> entityClazz,
|
||||||
String workingPath, String masterDuplicatePath, String outputPath) {
|
String workingPath, String masterDuplicatePath, String outputPath) {
|
||||||
|
|
||||||
// read the master-duplicate tuples
|
// read the master-duplicate tuples
|
||||||
Dataset<MasterDuplicate> md = spark
|
Dataset<MasterDuplicate> md = spark
|
||||||
.read()
|
.read()
|
||||||
|
@ -111,7 +112,7 @@ public class CleanCfHbSparkJob {
|
||||||
resolved
|
resolved
|
||||||
.joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicate")))
|
.joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicate")))
|
||||||
.map((MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping>) t -> {
|
.map((MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping>) t -> {
|
||||||
t._1().setMaster(t._2().getMaster());
|
t._1().setMasterId(t._2().getMasterId());
|
||||||
return t._1();
|
return t._1();
|
||||||
}, Encoders.bean(IdCfHbMapping.class))
|
}, Encoders.bean(IdCfHbMapping.class))
|
||||||
.write()
|
.write()
|
||||||
|
@ -154,13 +155,13 @@ public class CleanCfHbSparkJob {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public T reduce(T r, IdCfHbMapping a) {
|
public T reduce(T r, IdCfHbMapping a) {
|
||||||
if (Objects.isNull(a) && StringUtils.isBlank(a.getMaster())) {
|
if (Objects.isNull(a) && StringUtils.isBlank(a.getMasterId())) {
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
r.getCollectedfrom().forEach(kv -> updateKey(kv, a));
|
r.getCollectedfrom().forEach(kv -> updateKeyValue(kv, a));
|
||||||
r.getInstance().forEach(i -> {
|
r.getInstance().forEach(i -> {
|
||||||
updateKey(i.getHostedby(), a);
|
updateKeyValue(i.getHostedby(), a);
|
||||||
updateKey(i.getCollectedfrom(), a);
|
updateKeyValue(i.getCollectedfrom(), a);
|
||||||
});
|
});
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
@ -178,9 +179,10 @@ public class CleanCfHbSparkJob {
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void updateKey(final KeyValue kv, final IdCfHbMapping a) {
|
private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) {
|
||||||
if (kv.getKey().equals(a.getCfhb())) {
|
if (kv.getKey().equals(a.getCfhb())) {
|
||||||
kv.setKey(a.getMaster());
|
kv.setKey(a.getMasterId());
|
||||||
|
kv.setValue(a.getMasterName());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,9 @@ public class IdCfHbMapping implements Serializable {
|
||||||
|
|
||||||
private String cfhb;
|
private String cfhb;
|
||||||
|
|
||||||
private String master;
|
private String masterId;
|
||||||
|
|
||||||
|
private String masterName;
|
||||||
|
|
||||||
public IdCfHbMapping() {
|
public IdCfHbMapping() {
|
||||||
}
|
}
|
||||||
|
@ -34,11 +36,19 @@ public class IdCfHbMapping implements Serializable {
|
||||||
this.cfhb = cfhb;
|
this.cfhb = cfhb;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getMaster() {
|
public String getMasterId() {
|
||||||
return master;
|
return masterId;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setMaster(String master) {
|
public void setMasterId(String masterId) {
|
||||||
this.master = master;
|
this.masterId = masterId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getMasterName() {
|
||||||
|
return masterName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setMasterName(String masterName) {
|
||||||
|
this.masterName = masterName;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue