Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
5 changed files with 173 additions and 163 deletions
Showing only changes of commit bf35280ea6 - Show all commits

View File

@ -292,7 +292,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
} else if (value instanceof Result) { } else if (value instanceof Result) {
Result r = (Result) value; Result r = (Result) value;
if (Objects.nonNull(r.getFulltext()) && (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) || if (Objects.nonNull(r.getFulltext())
&& (ModelConstants.SOFTWARE_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()) ||
ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) { ModelConstants.DATASET_RESULTTYPE_CLASSID.equals(r.getResulttype().getClassid()))) {
r.setFulltext(null); r.setFulltext(null);
@ -326,10 +327,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
if (StringUtils.isBlank(r.getPublisher().getValue())) { if (StringUtils.isBlank(r.getPublisher().getValue())) {
r.setPublisher(null); r.setPublisher(null);
} else { } else {
r.getPublisher().setValue( r
r.getPublisher().getValue() .getPublisher()
.replaceAll(NAME_CLEANING_REGEX, " ") .setValue(
); r
.getPublisher()
.getValue()
.replaceAll(NAME_CLEANING_REGEX, " "));
} }
} }
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
@ -623,25 +627,28 @@ public class GraphCleaningFunctions extends CleaningFunctions {
private static Author cleanupAuthor(Author author) { private static Author cleanupAuthor(Author author) {
if (StringUtils.isNotBlank(author.getFullname())) { if (StringUtils.isNotBlank(author.getFullname())) {
author.setFullname( author
author.getFullname() .setFullname(
author
.getFullname()
.replaceAll(NAME_CLEANING_REGEX, " ") .replaceAll(NAME_CLEANING_REGEX, " ")
.replace("\"", "\\\"") .replace("\"", "\\\""));
);
} }
if (StringUtils.isNotBlank(author.getName())) { if (StringUtils.isNotBlank(author.getName())) {
author.setName( author
author.getName() .setName(
author
.getName()
.replaceAll(NAME_CLEANING_REGEX, " ") .replaceAll(NAME_CLEANING_REGEX, " ")
.replace("\"", "\\\"") .replace("\"", "\\\""));
);
} }
if (StringUtils.isNotBlank(author.getSurname())) { if (StringUtils.isNotBlank(author.getSurname())) {
author.setSurname( author
author.getSurname() .setSurname(
author
.getSurname()
.replaceAll(NAME_CLEANING_REGEX, " ") .replaceAll(NAME_CLEANING_REGEX, " ")
.replace("\"", "\\\"") .replace("\"", "\\\""));
);
} }
return author; return author;

View File

@ -18,7 +18,6 @@ package eu.dnetlib.pace.util;
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.net.URLDecoder; import java.net.URLDecoder;
import java.net.URLEncoder; import java.net.URLEncoder;

View File

@ -1,13 +1,13 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import static org.apache.spark.sql.functions.col;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import java.util.Arrays;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import java.util.Collections;
import eu.dnetlib.dhp.schema.oaf.Relation; import java.util.Iterator;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import java.util.Objects;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.beanutils.BeanUtils; import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -18,16 +18,17 @@ import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2; import scala.Tuple2;
import scala.Tuple3; import scala.Tuple3;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.Objects;
import static org.apache.spark.sql.functions.col;
public class SparkPropagateRelation extends AbstractSparkAction { public class SparkPropagateRelation extends AbstractSparkAction {
private static final Logger log = LoggerFactory.getLogger(SparkPropagateRelation.class); private static final Logger log = LoggerFactory.getLogger(SparkPropagateRelation.class);
@ -84,7 +85,8 @@ public class SparkPropagateRelation extends AbstractSparkAction {
.distinct() .distinct()
.cache(); .cache();
Dataset<Row> allRels = spark.read() Dataset<Row> allRels = spark
.read()
.schema(REL_BEAN_ENC.schema()) .schema(REL_BEAN_ENC.schema())
.json(DedupUtility.createEntityPath(graphBasePath, "relation")); .json(DedupUtility.createEntityPath(graphBasePath, "relation"));
@ -95,7 +97,8 @@ public class SparkPropagateRelation extends AbstractSparkAction {
.as(Encoders.tuple(REL_BEAN_ENC, Encoders.STRING(), Encoders.STRING())) .as(Encoders.tuple(REL_BEAN_ENC, Encoders.STRING(), Encoders.STRING()))
.flatMap(SparkPropagateRelation::addInferredRelations, REL_KRYO_ENC); .flatMap(SparkPropagateRelation::addInferredRelations, REL_KRYO_ENC);
Dataset<Relation> processedRelations = distinctRelations(dedupedRels.union(mergeRels.map((MapFunction<Relation, Relation>) r -> r, REL_KRYO_ENC))) Dataset<Relation> processedRelations = distinctRelations(
dedupedRels.union(mergeRels.map((MapFunction<Relation, Relation>) r -> r, REL_KRYO_ENC)))
.filter((FilterFunction<Relation>) r -> !Objects.equals(r.getSource(), r.getTarget())); .filter((FilterFunction<Relation>) r -> !Objects.equals(r.getSource(), r.getTarget()));
save(processedRelations, outputRelationPath, SaveMode.Overwrite); save(processedRelations, outputRelationPath, SaveMode.Overwrite);
@ -141,8 +144,7 @@ public class SparkPropagateRelation extends AbstractSparkAction {
.reduceGroups((ReduceFunction<Relation>) (b, a) -> { .reduceGroups((ReduceFunction<Relation>) (b, a) -> {
b.mergeFrom(a); b.mergeFrom(a);
return b; return b;
} })
)
.map((MapFunction<Tuple2<String, Relation>, Relation>) Tuple2::_2, REL_BEAN_ENC); .map((MapFunction<Tuple2<String, Relation>, Relation>) Tuple2::_2, REL_BEAN_ENC);
} }

View File

@ -1,14 +1,14 @@
package eu.dnetlib.dhp.oa.graph.group; package eu.dnetlib.dhp.oa.graph.group;
import com.fasterxml.jackson.databind.DeserializationFeature; import static org.junit.jupiter.api.Assertions.assertEquals;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.HdfsSupport; import java.io.IOException;
import eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob; import java.net.URISyntaxException;
import eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob; import java.nio.file.Files;
import eu.dnetlib.dhp.schema.common.ModelSupport; import java.nio.file.Path;
import eu.dnetlib.dhp.schema.oaf.Result; import java.nio.file.Paths;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -19,13 +19,15 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.*;
import java.io.IOException; import com.fasterxml.jackson.databind.DeserializationFeature;
import java.net.URISyntaxException; import com.fasterxml.jackson.databind.ObjectMapper;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import static org.junit.jupiter.api.Assertions.assertEquals; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.merge.DispatchEntitiesSparkJob;
import eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.DHPUtils;
@TestMethodOrder(MethodOrderer.OrderAnnotation.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class GroupEntitiesSparkJobTest { public class GroupEntitiesSparkJobTest {