nlphub/src/main/java/org/gcube/data/analysis/nlphub/nlp/NlpUtils.java

70 lines
3.7 KiB
Java
Executable File

package org.gcube.data.analysis.nlphub.nlp;
/**
 * Static string helpers used to prepare free text before it is handed to NLP
 * processing: building a short digest of a text, counting tokens, blanking
 * reserved characters and stripping characters outside an accepted set.
 *
 * <p>All methods are stateless. None of them accepts {@code null}; passing
 * {@code null} results in a {@link NullPointerException}.
 */
public class NlpUtils {

    /** Maximum number of tokens kept by {@link #getLanguageRecognizerDigest(String)}. */
    private static final int DIGEST_MAX_TOKENS = 100;

    /** Regular expression matching a run of one or more whitespace characters. */
    private static final String WHITESPACE_RUN = "\\s+";

    /**
     * Builds a short excerpt ("digest") of the given text made of its first
     * {@value #DIGEST_MAX_TOKENS} whitespace-separated tokens, joined by single
     * spaces and passed through {@link #escapeContent(String)}.
     *
     * <p>Judging by the method name the digest is meant to be fed to a language
     * recognizer; this class does not enforce that.
     *
     * @param content the text to summarize
     * @return the escaped digest; the empty string when {@code content} is empty
     *         or contains only whitespace
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static String getLanguageRecognizerDigest(String content) {
        // Splitting on whitespace *runs* keeps consecutive blanks/newlines from
        // producing empty tokens that would eat into the token budget.
        String[] tokens = content.trim().split(WHITESPACE_RUN);
        int len = Math.min(DIGEST_MAX_TOKENS, tokens.length);

        StringBuilder digest = new StringBuilder();
        for (int i = 0; i < len; i++) {
            if (i > 0) {
                digest.append(' ');
            }
            digest.append(tokens[i]);
        }
        return escapeContent(digest.toString());
    }

    /**
     * Counts the whitespace-separated tokens of the given text. Leading and
     * trailing whitespace is ignored and consecutive whitespace characters count
     * as a single separator.
     *
     * @param content the text to inspect
     * @return the number of tokens; {@code 0} for an empty or whitespace-only text
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static int countTokens(String content) {
        String trimmed = content.trim();
        if (trimmed.isEmpty()) {
            // "".split(...) yields one empty element, which is not a token.
            return 0;
        }
        return trimmed.split(WHITESPACE_RUN).length;
    }

    /**
     * Replaces every backslash, double quote, semicolon and equals sign with a
     * single space. The length of the text is preserved.
     *
     * <p>NOTE(review): these characters are presumably reserved by the format in
     * which the text is later transmitted - confirm against the callers.
     *
     * @param content the text to sanitize
     * @return {@code content} with the four characters above blanked out
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static String escapeContent(String content) {
        return content
                .replace('\\', ' ')
                .replace('"', ' ')
                .replace(';', ' ')
                .replace('=', ' ');
    }

    /**
     * Removes every character that is not in the accepted set (see
     * {@link #isCleanCharacter(char)}), replacing each run of removed characters
     * with a single space. Leading and trailing removed characters are dropped,
     * so the result never starts or ends with a space and never contains two
     * consecutive spaces.
     *
     * @param source the text to clean
     * @return the cleaned text, possibly empty
     * @throws NullPointerException if {@code source} is {@code null}
     */
    public static String replaceDirtyCharacters(String source) {
        StringBuilder cleaned = new StringBuilder(source.length());
        // True while we are inside a run of removed characters; the separating
        // space is emitted lazily so leading/trailing runs produce nothing.
        boolean pendingSpace = false;

        for (int i = 0; i < source.length(); i++) {
            char c = source.charAt(i);
            if (isCleanCharacter(c)) {
                if (pendingSpace && cleaned.length() > 0) {
                    cleaned.append(' ');
                }
                cleaned.append(c);
                pendingSpace = false;
            } else {
                pendingSpace = true;
            }
        }
        return cleaned.toString();
    }

    /**
     * Tells whether a character is kept by {@link #replaceDirtyCharacters(String)}.
     *
     * <p>Accepted UTF-16 code unit values: 33-90 except {@code '<'} and
     * {@code '>'}, 97-122, 128-167, 180-183, 210-212, 214-216 and 224-255.
     * Everything else - including all whitespace, {@code [ \ ] ^ _ `},
     * <code>{ | } ~</code> and every character above 255 - is rejected.
     *
     * <p>NOTE(review): as written the ranges keep U+0080-U+009F and reject
     * U+00C0-U+00D1 (e.g. uppercase accented letters) - confirm this is the
     * intended set.
     */
    private static boolean isCleanCharacter(char c) {
        if (c == '<' || c == '>') {
            return false;
        }
        return (c >= 33 && c <= 90)
                || (c >= 97 && c <= 122)
                || (c >= 128 && c <= 167)
                || (c >= 180 && c <= 183)
                || (c >= 210 && c <= 212)
                || (c >= 214 && c <= 216)
                || (c >= 224 && c <= 255);
    }
}