|
|
|
package eu.dnetlib.featureextraction.util;
|
|
|
|
|
|
|
|
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.List;
|
|
|
|
|
|
|
|
public class Utilities implements Serializable {
|
|
|
|
|
|
|
|
public static String DATA_ID_FIELD = "$.id";
|
|
|
|
|
|
|
|
static StructType inputSchema = new StructType(new StructField[]{
|
|
|
|
new StructField("id", DataTypes.StringType, false, Metadata.empty()),
|
|
|
|
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
|
|
|
|
});
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns a view of the dataset including the id and the chosen field.
|
|
|
|
*
|
|
|
|
* @param sqlContext: the spark sql context
|
|
|
|
* @param jsonRDD: the input dataset
|
|
|
|
* @param inputFieldJPath: the input field jpath
|
|
|
|
* @return the view of the dataset with normalized data of the inputField (id, inputField)
|
|
|
|
*/
|
|
|
|
public static Dataset<Row> prepareDataset(SQLContext sqlContext, JavaRDD<String> jsonRDD, String inputFieldJPath) {
|
|
|
|
|
|
|
|
JavaRDD<Row> rowRDD = jsonRDD
|
|
|
|
.map(json ->
|
|
|
|
RowFactory.create(getJPathString(DATA_ID_FIELD, json), Utilities.normalize(getJPathString(inputFieldJPath, json))));
|
|
|
|
return sqlContext.createDataFrame(rowRDD, inputSchema);
|
|
|
|
}
|
|
|
|
|
|
|
|
//returns the string value of the jpath in the given input json
|
|
|
|
public static String getJPathString(final String jsonPath, final String inputJson) {
|
|
|
|
try {
|
|
|
|
Object o = JsonPath.read(inputJson, jsonPath);
|
|
|
|
if (o instanceof String)
|
|
|
|
return (String)o;
|
|
|
|
if (o instanceof JSONArray && ((JSONArray)o).size()>0)
|
|
|
|
return (String)((JSONArray)o).get(0);
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
catch (Exception e) {
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
public static String normalize(final String s) {
|
|
|
|
return Normalizer.normalize(s, Normalizer.Form.NFD)
|
|
|
|
.replaceAll("[^\\w\\s-]", "") // Remove all non-word, non-space or non-dash characters
|
|
|
|
.replace('-', ' ') // Replace dashes with spaces
|
|
|
|
.trim() // trim leading/trailing whitespace (including what used to be leading/trailing dashes)
|
|
|
|
.toLowerCase(); // Lowercase the final results
|
|
|
|
}
|
|
|
|
|
|
|
|
public static void writeLinesToHDFSFile(List<String> lines, String filePath) throws IOException {
|
|
|
|
Configuration conf = new Configuration();
|
|
|
|
|
|
|
|
FileSystem fs = FileSystem.get(conf);
|
|
|
|
fs.delete(new Path(filePath), true);
|
|
|
|
|
|
|
|
try {
|
|
|
|
fs = FileSystem.get(conf);
|
|
|
|
|
|
|
|
Path outFile = new Path(filePath);
|
|
|
|
// Verification
|
|
|
|
if (fs.exists(outFile)) {
|
|
|
|
System.out.println("Output file already exists");
|
|
|
|
throw new IOException("Output file already exists");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create file to write
|
|
|
|
FSDataOutputStream out = fs.create(outFile);
|
|
|
|
try{
|
|
|
|
for (String line: lines) {
|
|
|
|
out.writeBytes(line + "\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
finally {
|
|
|
|
out.close();
|
|
|
|
}
|
|
|
|
} catch (IOException e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
}
|
|
|
|
}
|
2023-04-03 09:29:14 +02:00
|
|
|
}
|