diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index da2ba4723..87c935d83 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -44,6 +44,7 @@ import java.util.Date; import java.util.List; import java.util.function.Consumer; import java.util.function.Function; +import java.util.function.Predicate; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -53,6 +54,7 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; +import eu.dnetlib.dhp.oa.graph.raw.common.VerifyNsPrefixPredicate; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -113,6 +115,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i final String hdfsPath = parser.get("hdfsPath"); log.info("hdfsPath: {}", hdfsPath); + final String nsPrefixBlacklist = parser.get("nsPrefixBlacklist"); + log.info("nsPrefixBlacklist: {}", nsPrefixBlacklist); + + final Predicate verifyNamespacePrefix = new VerifyNsPrefixPredicate(nsPrefixBlacklist); + final boolean processClaims = parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims"); log.info("processClaims: {}", processClaims); @@ -123,23 +130,25 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i smdbe.execute("queryClaims.sql", smdbe::processClaims); } else { log.info("Processing datasources..."); - smdbe.execute("queryDatasources.sql", smdbe::processDatasource); + smdbe.execute("queryDatasources.sql", smdbe::processDatasource, verifyNamespacePrefix); log.info("Processing projects..."); if (dbSchema.equalsIgnoreCase("beta")) { - smdbe.execute("queryProjects.sql", smdbe::processProject); + smdbe.execute("queryProjects.sql", smdbe::processProject, verifyNamespacePrefix); } else { - smdbe.execute("queryProjects_production.sql", smdbe::processProject); + smdbe.execute("queryProjects_production.sql", smdbe::processProject, verifyNamespacePrefix); } log.info("Processing orgs..."); - smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); + smdbe.execute("queryOrganizations.sql", smdbe::processOrganization, verifyNamespacePrefix); log.info("Processing relationsNoRemoval ds <-> orgs ..."); - smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + smdbe + .execute( + "queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization, verifyNamespacePrefix); log.info("Processing projects <-> orgs ..."); - smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); + smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization, verifyNamespacePrefix); } log.info("All done."); } @@ -163,10 +172,20 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i } public void execute(final String sqlFile, final Function> producer) + throws Exception { + execute(sqlFile, producer, oaf -> true); + } + + public void execute(final String sqlFile, final Function> producer, + final Predicate predicate) throws Exception { final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/sql/" + sqlFile)); - final Consumer consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf)); + final Consumer consumer = rs -> producer.apply(rs).forEach(oaf -> { + if (predicate.test(oaf)) { + emitOaf(oaf); + } + }); dbClient.processResults(sql, consumer); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicate.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicate.java new file mode 100644 index 000000000..1e99d298d --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicate.java @@ -0,0 +1,62 @@ + +package eu.dnetlib.dhp.oa.graph.raw.common; + +import java.util.HashSet; +import java.util.Set; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +import org.apache.commons.lang3.StringUtils; + +import com.google.common.base.Splitter; + +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Relation; + +/** + * This predicate should be used to skip oaf objects using a blacklist of nsprefixes. + * + * @author michele + */ +public class VerifyNsPrefixPredicate implements Predicate { + + final Set invalids = new HashSet<>(); + + public VerifyNsPrefixPredicate(final String blacklist) { + if (StringUtils.isNotBlank(blacklist)) { + Splitter + .on(",") + .trimResults() + .omitEmptyStrings() + .split(blacklist) + .forEach(invalids::add); + } + } + + @Override + public boolean test(final Oaf oaf) { + if (oaf instanceof Datasource) { + return testValue(((Datasource) oaf).getNamespaceprefix().getValue()); + } else if (oaf instanceof OafEntity) { + return testValue(((OafEntity) oaf).getId()); + } else if (oaf instanceof Relation) { + return testValue(((Relation) oaf).getSource()) && testValue(((Relation) oaf).getTarget()); + } else { + return true; + } + } + + protected boolean testValue(final String s) { + if (StringUtils.isNotBlank(s)) { + for (final String invalid : invalids) { + if (Pattern.matches("^(\\d\\d\\|)?" + invalid + ".*$", s)) { + return false; + } + } + } + return true; + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json index 6dfef32db..b23ac6546 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json @@ -40,5 +40,11 @@ "paramLongName": "dbschema", "paramDescription": "the database schema according to the D-Net infrastructure (beta or production)", "paramRequired": true + }, + { + "paramName": "nsbl", + "paramLongName": "nsPrefixBlacklist", + "paramDescription": "a blacklist of nsprefixes (comma separeted)", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index d8b61b5ea..d8146d9a2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -43,7 +43,11 @@ isLookupUrl the address of the lookUp service - + + nsPrefixBlacklist + + a blacklist of nsprefixes (comma separeted) + sparkDriverMemory memory for driver process @@ -131,6 +135,7 @@ --isLookupUrl${isLookupUrl} --actionclaims --dbschema${dbSchema} + --nsPrefixBlacklist${nsPrefixBlacklist} @@ -182,6 +187,7 @@ --postgresPassword${postgresPassword} --isLookupUrl${isLookupUrl} --dbschema${dbSchema} + --nsPrefixBlacklist${nsPrefixBlacklist} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml index 66eaeeb26..4c319d037 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_claims/oozie_app/workflow.xml @@ -38,7 +38,11 @@ isLookupUrl the address of the lookUp service - + + nsPrefixBlacklist + + a blacklist of nsprefixes (comma separeted) + sparkDriverMemory memory for driver process @@ -113,6 +117,7 @@ --isLookupUrl${isLookupUrl} --actionclaims --dbschema${dbSchema} + --nsPrefixBlacklist${nsPrefixBlacklist} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml index eea8d0a5a..29d4269ef 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_db/oozie_app/workflow.xml @@ -25,7 +25,11 @@ isLookupUrl the address of the lookUp service - + + nsPrefixBlacklist + + a blacklist of nsprefixes (comma separeted) + sparkDriverMemory memory for driver process @@ -99,6 +103,7 @@ --postgresPassword${postgresPassword} --isLookupUrl${isLookupUrl} --dbschema${dbSchema} + --nsPrefixBlacklist${nsPrefixBlacklist} @@ -117,6 +122,7 @@ --isLookupUrl${isLookupUrl} --dbschema${dbSchema} --actionclaims + --nsPrefixBlacklist${nsPrefixBlacklist} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml index 868418152..9b68cfb05 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_step1/oozie_app/workflow.xml @@ -28,6 +28,11 @@ isLookupUrl the address of the lookUp service + + nsPrefixBlacklist + + a blacklist of nsprefixes (comma separeted) + sparkDriverMemory memory for driver process @@ -67,6 +72,7 @@ -pguser${postgresUser} -pgpasswd${postgresPassword} -islookup${isLookupUrl} + --nsPrefixBlacklist${nsPrefixBlacklist} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicateTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicateTest.java new file mode 100644 index 000000000..a14fb4ae3 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/common/VerifyNsPrefixPredicateTest.java @@ -0,0 +1,92 @@ + +package eu.dnetlib.dhp.oa.graph.raw.common; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Relation; + +class VerifyNsPrefixPredicateTest { + + private VerifyNsPrefixPredicate predicate; + + @BeforeEach + void setUp() throws Exception { + predicate = new VerifyNsPrefixPredicate("corda,nsf,wt"); + } + + @Test + void testTestValue() { + assertFalse(predicate.testValue("corda__2020")); + assertFalse(predicate.testValue("nsf________")); + assertFalse(predicate.testValue("nsf")); + assertFalse(predicate.testValue("corda")); + assertFalse(predicate.testValue("10|corda_______::fjkdsfjksdhfksj")); + assertFalse(predicate.testValue("20|corda_______::fjkdsfjksdhfksj")); + + assertTrue(predicate.testValue("xxxxxx_____")); + assertTrue(predicate.testValue("10|xxxxxx_____::sdasdasaddasad")); + + assertTrue(predicate.testValue(null)); + assertTrue(predicate.testValue("")); + } + + @Test + void testTest_ds_true() { + final Field prefix = new Field<>(); + prefix.setValue("xxxxxx______"); + + final Datasource ds = new Datasource(); + ds.setNamespaceprefix(prefix); + + assertTrue(predicate.test(ds)); + } + + @Test + void testTest_ds_false() { + final Field prefix = new Field<>(); + prefix.setValue("corda__2020"); + + final Datasource ds = new Datasource(); + ds.setNamespaceprefix(prefix); + + assertFalse(predicate.test(ds)); + } + + @Test + void testTest_rel_true() { + final Relation rel = new Relation(); + rel.setSource("10|yyyyyy______:sdfsfsffsdfs"); + rel.setTarget("10|xxxxxx______:sdfsfsffsdfs"); + assertTrue(predicate.test(rel)); + } + + @Test + void testTest_rel_false() { + final Relation rel = new Relation(); + rel.setSource("10|corda_______:sdfsfsffsdfs"); + rel.setTarget("10|xxxxxx______:sdfsfsffsdfs"); + assertFalse(predicate.test(rel)); + } + + @Test + void testTest_proj_true() { + final Project p = new Project(); + p.setId("10|xxxxxx______:sdfsfsffsdfs"); + assertTrue(predicate.test(p)); + } + + @Test + void testTest_proj_false() { + final Project p = new Project(); + p.setId("10|corda_____:sdfsfsffsdfs"); + assertFalse(predicate.test(p)); + } + +}