dnet-and/dnet-feature-extraction/src/main/java/eu/dnetlib/featureextraction/Utilities.java

172 lines
6.6 KiB
Java

package eu.dnetlib.featureextraction;
import com.google.common.collect.Lists;
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import com.ibm.icu.text.Transliterator;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.math.BigDecimal;
import java.text.Normalizer;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class Utilities implements Serializable {

    /** JSONPath expression locating the identifier of a record. */
    public static String DATA_ID_FIELD = "$.id";

    // Transliteration table: the character at index i of aliases_from is replaced
    // by the character at index i of aliases_to (the two strings are index-aligned
    // and must stay equal in length).
    private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
    private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";

    // Compiled once instead of on every unicodeNormalization call: matches textual
    // "\\uXXXX" escape sequences.
    private static final Pattern UNICODE_ESCAPE_PATTERN = Pattern.compile("\\\\u(\\p{XDigit}{4})");

    // Schema of the dataframe built by prepareDataset: (id, sentence), both non-null strings.
    static StructType inputSchema = new StructType(new StructField[]{
            new StructField("id", DataTypes.StringType, false, Metadata.empty()),
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });

    /**
     * Returns a view of the dataset including the id and the chosen field.
     *
     * @param sqlContext the spark sql context
     * @param jsonRDD the input dataset (one JSON document per element)
     * @param inputFieldJPath the jpath of the field to extract
     * @return the view of the dataset with normalized data of the inputField (id, inputField)
     */
    public static Dataset<Row> prepareDataset(SQLContext sqlContext, JavaRDD<String> jsonRDD, String inputFieldJPath) {
        JavaRDD<Row> rowRDD = jsonRDD
                .map(json ->
                        RowFactory.create(
                                getJPathString(DATA_ID_FIELD, json),
                                Utilities.normalize(getJPathString(inputFieldJPath, json))));
        return sqlContext.createDataFrame(rowRDD, inputSchema);
    }

    /**
     * Evaluates a jsonpath against a JSON document and returns its string value.
     * If the path resolves to a non-empty array, its first element is returned.
     *
     * @param jsonPath the jsonpath expression
     * @param inputJson the JSON document
     * @return the string value, or "" when the path is missing, empty, not a string,
     *         or the document is malformed
     */
    public static String getJPathString(final String jsonPath, final String inputJson) {
        try {
            Object o = JsonPath.read(inputJson, jsonPath);
            if (o instanceof String)
                return (String) o;
            if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
                return (String) ((JSONArray) o).get(0);
            return "";
        } catch (Exception e) {
            // Best-effort extraction: any parse/path error yields the empty string.
            return "";
        }
    }

    /**
     * Evaluates a jsonpath against a JSON document and returns a numeric array.
     *
     * @param jsonPath the jsonpath expression
     * @param inputJson the JSON document
     * @return the values as a double[], or an empty array when absent or on error
     */
    public static double[] getJPathArray(final String jsonPath, final String inputJson) {
        try {
            Object o = JsonPath.read(inputJson, jsonPath);
            if (o instanceof double[])
                return (double[]) o;
            if (o instanceof JSONArray) {
                Object[] objects = ((JSONArray) o).toArray();
                double[] array = new double[objects.length];
                for (int i = 0; i < objects.length; i++) {
                    // JsonPath may materialize numbers as BigDecimal or as boxed Double.
                    if (objects[i] instanceof BigDecimal)
                        array[i] = ((BigDecimal) objects[i]).doubleValue();
                    else
                        array[i] = (double) objects[i];
                }
                return array;
            }
            return new double[0];
        } catch (Exception e) {
            e.printStackTrace();  // best-effort: report and fall back to an empty array
            return new double[0];
        }
    }

    /**
     * Writes the given lines to an HDFS file, one per line, overwriting any
     * previous content at the same path.
     *
     * @param lines the lines to write
     * @param filePath the target HDFS path
     * @throws IOException if the filesystem cannot be accessed or the write fails
     */
    public static void writeLinesToHDFSFile(List<String> lines, String filePath) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path outFile = new Path(filePath);
        // Overwrite semantics: drop any previous output before writing.
        fs.delete(outFile, true);
        // try-with-resources guarantees the stream is closed even on failure;
        // IOExceptions propagate to the caller as the signature declares.
        try (FSDataOutputStream out = fs.create(outFile)) {
            for (String line : lines) {
                out.writeBytes(line + "\n");
            }
        }
    }

    /**
     * Replaces textual unicode escape sequences ("\\uXXXX") in the input with the
     * characters they denote.
     *
     * @param s the input string
     * @return the string with escape sequences decoded
     */
    public static String unicodeNormalization(final String s) {
        Matcher m = UNICODE_ESCAPE_PATTERN.matcher(s);
        StringBuffer buf = new StringBuffer(s.length());
        while (m.find()) {
            String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
            // quoteReplacement: the decoded char could be '$' or '\', which are
            // special in appendReplacement replacement strings.
            m.appendReplacement(buf, Matcher.quoteReplacement(ch));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /**
     * Normalizes a string for matching: decodes unicode escapes, applies NFD
     * decomposition, transliterates known aliases, then strips non-word
     * characters, combining diacritical marks, punctuation, digits and newlines,
     * and finally lower-cases and trims the result.
     *
     * @param s the input string
     * @return the normalized string
     */
    public static String normalize(final String s) {
        return fixAliases(Normalizer.normalize(unicodeNormalization(s), Normalizer.Form.NFD))
                // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
                .replaceAll("[^ \\w]+", "")
                .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
                .replaceAll("(\\p{Punct})+", " ")
                .replaceAll("(\\d)+", " ")
                .replaceAll("(\\n)+", " ")
                .toLowerCase()
                .trim();
    }

    /**
     * Transliterates each character found in {@code aliases_from} to its
     * counterpart at the same index in {@code aliases_to}; all other characters
     * are kept unchanged.
     *
     * @param s the input string
     * @return the transliterated string
     */
    protected static String fixAliases(final String s) {
        final StringBuilder sb = new StringBuilder(s.length());
        for (final char ch : Lists.charactersOf(s)) {
            final int i = StringUtils.indexOf(aliases_from, ch);
            sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
        }
        return sb.toString();
    }
}