172 lines
6.6 KiB
Java
172 lines
6.6 KiB
Java
package eu.dnetlib.featureextraction;
|
|
|
|
import com.google.common.collect.Lists;
import com.ibm.icu.text.Transliterator;
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
|
|
|
|
public class Utilities implements Serializable {
|
|
|
|
public static String DATA_ID_FIELD = "$.id";
|
|
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
|
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
|
|
|
static StructType inputSchema = new StructType(new StructField[]{
|
|
new StructField("id", DataTypes.StringType, false, Metadata.empty()),
|
|
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
|
|
});
|
|
|
|
/**
|
|
* Returns a view of the dataset including the id and the chosen field.
|
|
*
|
|
* @param sqlContext: the spark sql context
|
|
* @param jsonRDD: the input dataset
|
|
* @param inputFieldJPath: the input field jpath
|
|
* @return the view of the dataset with normalized data of the inputField (id, inputField)
|
|
*/
|
|
public static Dataset<Row> prepareDataset(SQLContext sqlContext, JavaRDD<String> jsonRDD, String inputFieldJPath) {
|
|
|
|
JavaRDD<Row> rowRDD = jsonRDD
|
|
.map(json ->
|
|
RowFactory.create(getJPathString(DATA_ID_FIELD, json), Utilities.normalize(getJPathString(inputFieldJPath, json))));
|
|
return sqlContext.createDataFrame(rowRDD, inputSchema);
|
|
}
|
|
|
|
//returns the string value of the jpath in the given input json
|
|
public static String getJPathString(final String jsonPath, final String inputJson) {
|
|
try {
|
|
Object o = JsonPath.read(inputJson, jsonPath);
|
|
if (o instanceof String)
|
|
return (String)o;
|
|
if (o instanceof JSONArray && ((JSONArray)o).size()>0)
|
|
return (String)((JSONArray)o).get(0);
|
|
return "";
|
|
}
|
|
catch (Exception e) {
|
|
return "";
|
|
}
|
|
}
|
|
|
|
public static double[] getJPathArray(final String jsonPath, final String inputJson) {
|
|
try {
|
|
Object o = JsonPath.read(inputJson, jsonPath);
|
|
if (o instanceof double[])
|
|
return (double[]) o;
|
|
if (o instanceof JSONArray) {
|
|
Object[] objects = ((JSONArray) o).toArray();
|
|
double[] array = new double[objects.length];
|
|
for (int i = 0; i < objects.length; i++) {
|
|
if (objects[i] instanceof BigDecimal)
|
|
array[i] = ((BigDecimal)objects[i]).doubleValue();
|
|
else
|
|
array[i] = (double) objects[i];
|
|
}
|
|
return array;
|
|
}
|
|
return new double[0];
|
|
}
|
|
catch (Exception e) {
|
|
e.printStackTrace();
|
|
return new double[0];
|
|
}
|
|
}
|
|
|
|
// public static String normalize(final String s) {
|
|
// return Normalizer.normalize(s, Normalizer.Form.NFD)
|
|
// .replaceAll("[^\\w\\s-]", "") // Remove all non-word, non-space or non-dash characters
|
|
// .replace('-', ' ') // Replace dashes with spaces
|
|
// .trim() // trim leading/trailing whitespace (including what used to be leading/trailing dashes)
|
|
// .toLowerCase(); // Lowercase the final results
|
|
// }
|
|
|
|
public static void writeLinesToHDFSFile(List<String> lines, String filePath) throws IOException {
|
|
Configuration conf = new Configuration();
|
|
|
|
FileSystem fs = FileSystem.get(conf);
|
|
fs.delete(new Path(filePath), true);
|
|
|
|
try {
|
|
fs = FileSystem.get(conf);
|
|
|
|
Path outFile = new Path(filePath);
|
|
// Verification
|
|
if (fs.exists(outFile)) {
|
|
System.out.println("Output file already exists");
|
|
throw new IOException("Output file already exists");
|
|
}
|
|
|
|
// Create file to write
|
|
FSDataOutputStream out = fs.create(outFile);
|
|
try{
|
|
for (String line: lines) {
|
|
out.writeBytes(line + "\n");
|
|
}
|
|
}
|
|
finally {
|
|
out.close();
|
|
}
|
|
} catch (IOException e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
public static String unicodeNormalization(final String s) {
|
|
|
|
Matcher m = Pattern.compile("\\\\u(\\p{XDigit}{4})").matcher(s);
|
|
StringBuffer buf = new StringBuffer(s.length());
|
|
while (m.find()) {
|
|
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
|
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
|
}
|
|
m.appendTail(buf);
|
|
return buf.toString();
|
|
}
|
|
|
|
|
|
public static String normalize(final String s) {
|
|
return fixAliases(Normalizer.normalize(unicodeNormalization(s), Normalizer.Form.NFD))
|
|
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
|
.replaceAll("[^ \\w]+", "")
|
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
|
.replaceAll("(\\p{Punct})+", " ")
|
|
.replaceAll("(\\d)+", " ")
|
|
.replaceAll("(\\n)+", " ")
|
|
.toLowerCase()
|
|
.trim();
|
|
}
|
|
|
|
protected static String fixAliases(final String s) {
|
|
final StringBuilder sb = new StringBuilder();
|
|
for (final char ch : Lists.charactersOf(s)) {
|
|
final int i = StringUtils.indexOf(aliases_from, ch);
|
|
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
|
|
}
|
|
return sb.toString();
|
|
}
|
|
}
|