dhp-graph-dump/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/filterentities/ReadDatasourceMasterDuplica...

324 lines
9.8 KiB
Java

package eu.dnetlib.dhp.oa.graph.dump.filterentities;
/**
 * @author miriam.baglioni
 * @Date 26/03/24
 */
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
/**
 * Reads the master/duplicate datasource pairs from the database and writes the ones whose duplicate
 * identifier starts with {@code "eosc"} to a file on HDFS, one JSON-serialized
 * {@link MasterDuplicate} per line.
 */
public class ReadDatasourceMasterDuplicateFromDB {

    private static final Logger log = LoggerFactory.getLogger(ReadDatasourceMasterDuplicateFromDB.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /** Query returning one row per (master datasource, duplicate datasource) pair. */
    private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);";

    /** Separator searched for in a duplicate id; the text after its last occurrence is kept. */
    private static final String ID_SEPARATOR = "::";

    public ReadDatasourceMasterDuplicateFromDB() {
    }

    /**
     * Runs {@link #QUERY} against the given database and stores the EOSC master/duplicate
     * associations as JSON lines (UTF-8) in {@code hdfsPath}.
     *
     * <p>NOTE(review): the returned counter is incremented once after the query has been processed,
     * not once per row, so a successful run returns 1. This is kept as-is because callers may rely
     * on it; confirm whether a row count was intended.
     *
     * @param dbUrl JDBC url passed to {@link DbClient}
     * @param dbUser database user passed to {@link DbClient}
     * @param dbPassword database password passed to {@link DbClient}
     * @param hdfsPath path of the output file to create
     * @param hdfsNameNode value set as {@code fs.defaultFS} in the Hadoop configuration
     * @return 1 when the query has been processed without errors
     * @throws IOException if the output file cannot be created or a resource cannot be closed
     */
    public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
        throws IOException {
        int count = 0;
        try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", hdfsNameNode);
            FileSystem fileSystem = FileSystem.get(conf);
            FSDataOutputStream fos = fileSystem.create(new Path(hdfsPath));

            log.info("running query: {}", QUERY);
            log.info("storing results in: {}", hdfsPath);

            // Closing the writer also closes the wrapped HDFS output stream.
            try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
                dbClient.processResults(QUERY, rs -> writeMap(datasourceMasterMap(rs), writer));
                count++;
            }
        }
        return count;
    }

    /**
     * Maps the current row of the result set to a {@link MasterDuplicate}.
     *
     * @param rs result set positioned on the row to read; must expose the {@code duplicateId},
     *           {@code masterId} and {@code masterName} columns
     * @return the mapped association, or {@code null} when the duplicate id is missing or does not
     *         start with {@code "eosc"}
     * @throws RuntimeException wrapping any {@link SQLException} raised while reading the row
     */
    private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
        try {
            String duplicateId = rs.getString("duplicateId");
            String masterId = rs.getString("masterId");
            String masterName = rs.getString("masterName");

            // A row without a duplicate id cannot be an EOSC duplicate: skip it instead of failing
            // with a NullPointerException.
            if (duplicateId == null || !duplicateId.startsWith("eosc")) {
                return null;
            }

            MasterDuplicate md = new MasterDuplicate();
            md.setEoscId(getEoscDsId(duplicateId));
            // NOTE(review): 10 is presumably the prefix used for datasource identifiers - confirm.
            md.setGraphId(OafMapperUtils.createOpenaireId(10, masterId, true));
            md.setGraphName(masterName);
            return md;
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Derives the EOSC datasource identifier from a duplicate id: takes the text following the last
     * {@code "::"} and, for a fixed set of known identifiers, replaces it with its mapped value.
     * Identifiers that are not in the table are returned unchanged.
     *
     * @param duplicateId the duplicate datasource identifier; must not be {@code null}
     * @return the EOSC datasource identifier
     */
    @NotNull
    private static String getEoscDsId(String duplicateId) {
        // Without the separator lastIndexOf returns -1; keep the whole id instead of silently
        // dropping its first character.
        final int separatorIdx = duplicateId.lastIndexOf(ID_SEPARATOR);
        final String localId = separatorIdx < 0
            ? duplicateId
            : duplicateId.substring(separatorIdx + ID_SEPARATOR.length());

        switch (localId) {
            case "eosc.blue-cloud.44fa8dba8ad3ed19445227940032f31c":
                return "eosc.blue-cloud.grsf";
            case "eosc.ror-org.24ef0000cfbf3ce7f3a40ba6b87e76ce":
                return "eosc.ror-org.ror";
            case "eosc.clarin-eric.2aad8ade139792a49b130b539e1bb144":
                return "eosc.clarin-eric.virtual_language_observatory";
            case "eosc.embl-ebi.e29a4e098afa05818957179f05d8e21d":
                return "eosc.embl-ebi.icr";
            case "eosc.cyfronet.b59c2171d05ed9fb9e70a86d544f42a3":
                return "eosc.cyfronet.rodbuk";
            case "eosc.eudat.9168f179ffab97584bf99a2729837545":
                return "eosc.eudat.b2safe";
            case "eosc.oxford_e-research_centre.21697de1a5b10b8eb5fad857edecf5c9":
                return "eosc.oxford_e-research_centre.fairsharing";
            case "eosc.inria.5923d0f31f0acda46cf4b592972284a2":
                return "eosc.inria.software_heritage_archive";
            case "eosc.rli.661cdfdc74561b8eb69583b8137799d2":
                return "eosc.rli.open_energy_platform";
            case "eosc.bbmri-eric.314cee7546a7489c2cc3ab79d34e2640":
                return "eosc.bbmri-eric.bbmri-eric_directory";
            case "eosc.ku_leuven.68bf19ae7ee1bc7e3872255e96550c04":
                return "eosc.ku_leuven.lirias";
            case "eosc.wenmr.d288225c333b07fc9d001da5c5392741":
                return "eosc.wenmr.madomsi3sobm";
            case "eosc.zpid.b96341f00ca4c3a314abcc07fc0084b2":
                return "eosc.zpid.psycharchives";
            case "eosc.vamdc.c967f669aa354e584e6786ee1d0c823e":
                return "eosc.vamdc.vamdc_portal";
            case "eosc.openaire.2bb8710e1870170a175110615698e677":
                return "eosc.openaire.openaire_scholexplorer";
            case "eosc.elixir-uk.5126ffcc8e23f65bbbe219d36128f2c8":
                return "eosc.elixir-uk.workflowhub";
            case "eosc.vliz.61c6dae33d794d477e6a68ed43f52eb3":
                return "eosc.vliz.worms";
            case "eosc.cern.8025243fa3c887159fc9b3930ae147c2":
                return "eosc.cern.cod";
            case "eosc.hits.901e9baaa76d72017ebd7dfd93436caf":
                return "eosc.hits.fairdomhub";
            case "eosc.bbmri-eric.8206c9aa93eb9513383218704570feb2":
                return "eosc.bbmri-eric.bbmri-eric_crc-cohort";
            case "eosc.hn.02e4d980399d7142506e8aadb2b8e865":
                return "eosc.hn.isidore";
            // NOTE(review): the next two targets embed a "::"-prefixed namespace, unlike every
            // other entry; reproduced verbatim - confirm this is intentional.
            case "eosc.obsparis.9e98089baaf6af32fab3154873dfdfeb":
                return "eosc.obsparis::eosc.obsparis.padc";
            case "eosc.esrf.ecc74ab09791c52aa238ee77ae988874":
                return "eosc.esrf::eosc.esrf.tesrfdp";
            case "eosc.cessda-eric.7e17e8817404ce7a8013be373723b2be":
                return "eosc.cessda-eric.cdc";
            case "eosc.psi.f1a79f572f95bc2fbea5cdc40ef4eb22":
                return "eosc.psi.psi_public_data_repository";
            case "eosc.uniwersytet_opolski.19b44a96f7a776774de3939d9820d00c":
                return "eosc.uniwersytet_opolski.bk_uniopole";
            case "eosc.lindatclariah-cz.6dc98fcb5294282acf3d92f3ab3376b2":
                return "eosc.lindatclariah-cz.lindatclariah-cz_repository";
            case "eosc.eudat.17bb7bb8ef1af0f9bdb55a7db30cfa8a":
                return "eosc.eudat.b2share";
            case "eosc.acdh-ch.3b0149bee976d6db7eef053159e97a87":
                return "eosc.acdh-ch.arche";
            case "eosc.uit.49e8d4cef23bda3b66dd417e6675727d":
                return "eosc.uit.trolling";
            case "eosc.csuc.135887d3dea4b6723095d13c28dd52a3":
                return "eosc.csuc.corardr";
            case "eosc.ccsd.06cdd3ff4700bb4c8e7bf22c14f23f5b":
                return "eosc.ccsd.episciences";
            case "eosc.gbif.14ac40283813a624bd74ae82605ded23":
                return "eosc.gbif.gbif_species_occurrence_data";
            case "eosc.gdansk_tech.1434de11c83986b5be5592677f28d171":
                return "eosc.gdansk_tech.most";
            case "eosc.gwdg.d6521479ffa922bbccc839606b8ec7c5":
                return "eosc.gwdg.textgrid_repository";
            case "eosc.unipd.12d35bb1f56d4b91bb4644faf76d9486":
                return "eosc.unipd.rdu";
            case "eosc.unibi-ub.a61d9ea844bdf43e6feabd6b14dfe3c5":
                return "eosc.unibi-ub.pub";
            case "eosc.scipedia.0063745e5964b19c3e9ceeb2bd6632f5":
                return "eosc.scipedia.spaosp";
            case "eosc.psnc.6f0470e3bb9203ec3a7553f3a72a7a1f":
                return "eosc.psnc.rohub";
            case "eosc.ill.d422cba59746f39d10bdfea5c9cf8511":
                return "eosc.ill.ill_data_portal";
            case "eosc.ceric-eric.e9354332fd75190b935b80c1ba30b837":
                return "eosc.ceric-eric.ceric-data-portal";
            case "eosc.cnr_-_isti.dbe89d2b83f3e29caab7923a51c1d151":
                return "eosc.cnr_-_isti.isti_open_portal";
            case "eosc.lapp.ef0bb7d889d0cced364444495f7a1e67":
                return "eosc.lapp.ossr";
            case "eosc.lida.26c1ee137e7510fd1d7e44eb87cdb4af":
                return "eosc.lida.lida_survey_data";
            case "eosc.awi_bremerhaven.2882af227241cb956c28fe321a70dfb2":
                return "eosc.awi_bremerhaven.pangaea";
            case "eosc.riga_stradins_university.4ea61809e753e65a459bbe4a492c773b":
                return "eosc.riga_stradins_university.rsu_dataverse";
            case "eosc.ku_leuven.1cb0937dc41e70d8126d7b259ad470af":
                return "eosc.ku_leuven.ku_leuven_rdr";
            case "eosc.dkrz.9ffffb05aaf22e7f9138dca4560a8c8b":
                return "eosc.dkrz.wdcc";
            case "eosc.openaire.0a02f13310296033694acead588a773b":
                return "eosc.openaire.zenodo";
            case "eosc.vilnius-university.1ec069c1620d49d460e4cbcec0af57f6":
                return "eosc.vilnius-university.tnoarda";
            case "eosc.icos_eric.25c5f3f0674fb287e05e697263e211e2":
                return "eosc.icos_eric.data_discovery_and_access_portal";
            case "eosc.fris.8f42bfccf70de38b01763b704300f882":
                return "eosc.fris.fris";
            default:
                // Not in the table: the extracted id is already the EOSC id.
                return localId;
        }
    }

    /**
     * Serializes the association as JSON and appends it to the writer followed by a line break.
     *
     * @param dm the association to write; {@code null} entries are ignored
     * @param writer destination of the JSON lines
     * @throws RuntimeException wrapping any {@link IOException} raised while serializing or writing
     */
    private static void writeMap(MasterDuplicate dm, BufferedWriter writer) {
        if (dm == null) {
            return;
        }
        try {
            writer.write(OBJECT_MAPPER.writeValueAsString(dm));
            writer.newLine();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}