|
|
|
package eu.dnetlib.featureextraction.util;
|
|
|
|
|
|
|
|
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.List;
|
|
|
|
|
|
|
|
public class Utilities implements Serializable {
|
|
|
|
|
|
|
|
public static String DATA_ID_FIELD = "$.id";
|
|
|
|
|
|
|
|
static StructType inputSchema = new StructType(new StructField[]{
|
|
|
|
new StructField("id", DataTypes.StringType, false, Metadata.empty()),
|
|
|
|
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
|
|
|
|
});
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Returns a view of the dataset including the id and the chosen field.
|
|
|
|
*
|
|
|
|
* @param sqlContext: the spark sql context
|
|
|
|
* @param jsonRDD: the input dataset
|
|
|
|
* @param inputFieldJPath: the input field jpath
|
|
|
|
* @return the view of the dataset with normalized data of the inputField (id, inputField)
|
|
|
|
*/
|
|
|
|
public static Dataset<Row> prepareDataset(SQLContext sqlContext, JavaRDD<String> jsonRDD, String inputFieldJPath) {
|
|
|
|
|
|
|
|
JavaRDD<Row> rowRDD = jsonRDD
|
|
|
|
.map(json ->
|
|
|
|
RowFactory.create(getJPathString(DATA_ID_FIELD, json), Utilities.normalize(getJPathString(inputFieldJPath, json))));
|
|
|
|
return sqlContext.createDataFrame(rowRDD, inputSchema);
|
|
|
|
}
|
|
|
|
|
|
|
|
//returns the string value of the jpath in the given input json
|
|
|
|
public static String getJPathString(final String jsonPath, final String inputJson) {
|
|
|
|
try {
|
|
|
|
Object o = JsonPath.read(inputJson, jsonPath);
|
|
|
|
if (o instanceof String)
|
|
|
|
return (String)o;
|
|
|
|
if (o instanceof JSONArray && ((JSONArray)o).size()>0)
|
|
|
|
return (String)((JSONArray)o).get(0);
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
catch (Exception e) {
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
public static String normalize(final String s) {
|
|
|
|
return Normalizer.normalize(s, Normalizer.Form.NFD)
|
|
|
|
.replaceAll("[^\\w\\s-]", "") // Remove all non-word, non-space or non-dash characters
|
|
|
|
.replace('-', ' ') // Replace dashes with spaces
|
|
|
|
.trim() // trim leading/trailing whitespace (including what used to be leading/trailing dashes)
|
|
|
|
.toLowerCase(); // Lowercase the final results
|
|
|
|
}
|
|
|
|
|
|
|
|
public static void writeLinesToHDFSFile(List<String> lines, String filePath) throws IOException {
|
|
|
|
Configuration conf = new Configuration();
|
|
|
|
|
|
|
|
FileSystem fs = FileSystem.get(conf);
|
|
|
|
fs.delete(new Path(filePath), true);
|
|
|
|
|
|
|
|
try {
|
|
|
|
fs = FileSystem.get(conf);
|
|
|
|
|
|
|
|
Path outFile = new Path(filePath);
|
|
|
|
// Verification
|
|
|
|
if (fs.exists(outFile)) {
|
|
|
|
System.out.println("Output file already exists");
|
|
|
|
throw new IOException("Output file already exists");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create file to write
|
|
|
|
FSDataOutputStream out = fs.create(outFile);
|
|
|
|
try{
|
|
|
|
for (String line: lines) {
|
|
|
|
out.writeBytes(line + "\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
finally {
|
|
|
|
out.close();
|
|
|
|
}
|
|
|
|
} catch (IOException e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
}
|
|
|
|
}
|
2023-04-03 09:29:14 +02:00
|
|
|
}
|