// File: dnet-and/dnet-feature-extraction/src/main/java/eu/dnetlib/featureextraction/util/Utilities.java
package eu.dnetlib.featureextraction.util;
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.List;
import java.util.Locale;
public class Utilities implements Serializable {

    /** JSONPath expression locating the record identifier in input JSON documents. */
    public static String DATA_ID_FIELD = "$.id";

    // Schema of the (id, sentence) view produced by prepareDataset.
    static StructType inputSchema = new StructType(new StructField[]{
            new StructField("id", DataTypes.StringType, false, Metadata.empty()),
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });

    /**
     * Returns a view of the dataset including the id and the chosen field.
     *
     * @param sqlContext the spark sql context
     * @param jsonRDD the input dataset (one JSON document per element)
     * @param inputFieldJPath the input field jpath
     * @return the view of the dataset with normalized data of the inputField (id, inputField)
     */
    public static Dataset<Row> prepareDataset(SQLContext sqlContext, JavaRDD<String> jsonRDD, String inputFieldJPath) {
        JavaRDD<Row> rowRDD = jsonRDD
                .map(json ->
                        RowFactory.create(getJPathString(DATA_ID_FIELD, json), Utilities.normalize(getJPathString(inputFieldJPath, json))));
        return sqlContext.createDataFrame(rowRDD, inputSchema);
    }

    /**
     * Extracts the string value addressed by a JSONPath expression from a JSON document.
     * <p>
     * Returns the value when it is a string, the first element when it is a non-empty
     * array, and the empty string in every other case (missing path, malformed JSON,
     * non-string value) — so callers never have to handle {@code null} or exceptions.
     *
     * @param jsonPath the JSONPath expression to evaluate
     * @param inputJson the JSON document to read from
     * @return the matched string value, or "" when no string value can be extracted
     */
    public static String getJPathString(final String jsonPath, final String inputJson) {
        try {
            Object o = JsonPath.read(inputJson, jsonPath);
            if (o instanceof String)
                return (String) o;
            if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
                return (String) ((JSONArray) o).get(0);
            return "";
        } catch (Exception e) {
            // Malformed JSON or an unmatched path is treated as "no value"
            // rather than failing the whole Spark job on one bad record.
            return "";
        }
    }

    /**
     * Normalizes a string for comparison: decomposes accented characters (NFD) so the
     * combining marks are dropped by the punctuation filter, replaces dashes with
     * spaces, trims, and lower-cases.
     *
     * @param s the string to normalize (must not be null)
     * @return the normalized, lower-cased string
     */
    public static String normalize(final String s) {
        return Normalizer.normalize(s, Normalizer.Form.NFD)
                .replaceAll("[^\\w\\s-]", "") // Remove all non-word, non-space or non-dash characters (incl. combining accents)
                .replace('-', ' ')            // Replace dashes with spaces
                .trim()                       // trim leading/trailing whitespace (including what used to be leading/trailing dashes)
                .toLowerCase(Locale.ROOT);    // locale-independent lowering (avoids the Turkish-i problem)
    }

    /**
     * Writes the given lines to an HDFS file, one per line, replacing any existing
     * file at that path. Content is encoded as UTF-8.
     *
     * @param lines the lines to write
     * @param filePath the HDFS path of the output file
     * @throws IOException if the file cannot be deleted, created, or written
     */
    public static void writeLinesToHDFSFile(List<String> lines, String filePath) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path outFile = new Path(filePath);
        // Remove any previous output; fs.create below would otherwise fail or overwrite inconsistently.
        fs.delete(outFile, true);
        // try-with-resources guarantees the stream is closed even on failure.
        // Errors now propagate to the caller (the signature already declares IOException)
        // instead of being swallowed by printStackTrace().
        try (FSDataOutputStream out = fs.create(outFile)) {
            for (String line : lines) {
                // getBytes(UTF_8) instead of writeBytes(): writeBytes truncates each
                // char to its low byte, corrupting any non-ASCII content.
                out.write((line + "\n").getBytes(StandardCharsets.UTF_8));
            }
        }
    }
}