diff --git a/services/sitempas/extractUrlsFromSearch.ts b/services/sitempas/extractUrlsFromSearch.ts
new file mode 100644
index 00000000..48186923
--- /dev/null
+++ b/services/sitempas/extractUrlsFromSearch.ts
@@ -0,0 +1,250 @@
+'use strict';
+
+import {properties} from "../../explore/src/environments/environment";
+import {SearchResearchResultsService} from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service";
+import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview";
+
+import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
+import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
+import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
+
+let express = require('express');
+let app = express();
+const request = require('superagent');
+const URL = require('url');
+let cors = require('cors');
+app.use(cors());
+
+/**
+ * Crawls the search API and writes every landing-page URL it finds to the sitemap file.
+ *
+ * Requests the refine (facet) values from refineUrl and, for every value that passes the filters
+ * below, fetches one page of results restricted to that value. Each response is handed to
+ * parseAllUrls. Once all requests have settled, the closing tag is appended to the sitemap and
+ * the "total_time" timer is stopped.
+ *
+ * @param resultsPerUrl page size used for each filtered results request
+ */
+function get(resultsPerUrl) {
+  setTimeout(() => {
+    let searchFields = new SearchFields();
+    let fieldIdsMap = searchFields.RESULT_FIELDS;
+
+    request.get(refineUrl, async function (err: any, refineResponse: any) {
+      if (!refineResponse && err) {
+        console.error("Error getting refine filters ",err);
+      } else {
+        // A body without 'refineResults' leaves nothing to crawl; it must not throw on keys.length.
+        let refineResults = (refineResponse.body && refineResponse.body['refineResults']) || {};
+        let keys = Object.keys(refineResults);
+        console.log("number of keys: " + keys.length);
+
+        let allUrls = new Set();
+
+        var promiseArray = [];
+
+        for (let key of keys) {
+          if(key == "community") {
+            // publicCommunities is filled asynchronously by getCommunities().
+            await communitiesPromise;
+          }
+          console.log("key: "+key+", number of values: " + refineResults[key].length);
+
+          for (let value of refineResults[key]) {
+            if(!value || !value.name || !value.id
+              || value.name.toLowerCase().includes('unknown') || value.name.toLowerCase().includes('not available')
+              || value.name == "unidentified" || value.name == "Undetermined") {
+              console.log("filtered out: "+(value ? ("name: "+value.name + " - id: "+value.id) : value));
+              continue;
+            }
+
+            if(key=="community") {
+              // Only the part of the id before "||" is compared with the public communities.
+              let valueId = value.id.split("||")[0];
+              if(!valueId || !publicCommunities.includes(valueId)) {
+                console.log("hidden community: "+valueId);
+                continue;
+              }
+            }
+
+            const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;
+
+            promiseArray.push(new Promise<void>((resolve) => {
+              request.get(url, function (err: any, response: any) {
+                if (!response && err) {
+                  console.error("Error getting results ", err);
+                  fs.appendFileSync("./"+errorFileName, "no response "+url);
+                  fs.appendFileSync("./"+errorFileName, String(err));
+                  fs.appendFileSync("./"+errorFileName, "\n");
+                } else {
+                  parseAllUrls(response, allUrls);
+                }
+                // The failure is already logged; rejecting would make Promise.all below throw
+                // and leave the sitemap unclosed because of a single failed request.
+                resolve();
+              })
+            }));
+          }
+          console.log("");
+        }
+
+        await Promise.all(promiseArray);
+        console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: "+notin);
+
+        // Close the <urlset> element opened by buildSiteMap.
+        fs.appendFile("./" + fileName, "\n</urlset>", function (err) {
+          if (err) {
+            return console.log("Error appending in file "+fileName+": ", err);
+          }
+          console.timeEnd("total_time");
+        });
+      }
+    })
+  })
+}
+
+/**
+ * Turns one search response into landing-page URLs and appends each URL that is not yet in
+ * allUrls to the sitemap file as a <url> entry. Updates the alreadyin / notin counters.
+ *
+ * NOTE(review): the result-to-URL conversion and the XML literals are written against the
+ * sitemaps.org protocol and the helpers imported by this file - confirm the expected output.
+ *
+ * @param response superagent response of a results request
+ * @param allUrls set of the URLs written so far; new URLs are added to it
+ * @returns the same allUrls set
+ */
+function parseAllUrls(response: any, allUrls: any) {
+  let responses: any = response.body['results'];
+  let searchResearchResultsService: any = new SearchResearchResultsService();
+
+  let searchResults: any = searchResearchResultsService.parseResults("result", responses, properties);
+  if(searchResults.length < 100 && searchResults.length > 0) {
+    console.log("num of results: "+searchResults.length + " " + response.request.url);
+  }
+
+  if(searchResults.length == 0) {
+    // One record per line: a real newline, not the two characters "/n".
+    fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"\n");
+  }
+
+  for(let j=0; j<searchResults.length; j++) {
+    let resultPreview: any = ResultPreview.searchResultConvert(searchResults[j], searchResults[j].entityType);
+    let pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);
+
+    // Prefer the persistent identifier; otherwise fall back to the id of the result.
+    let url: string;
+    if(pid && pid.id) {
+      url = getUrlByType(resultPreview.resultType, pid, encodeURIComponent(pid.id));
+    } else {
+      url = getUrlByType(resultPreview.resultType, null, resultPreview.id);
+    }
+
+    if(allUrls.has(url)) {
+      alreadyin++;
+    } else {
+      allUrls.add(url);
+      let urlPre = "<url><loc>";
+      let urlSuf = "</loc>\n" + "</url>";
+      fs.appendFileSync("./"+fileName, urlPre + url + urlSuf);
+      notin++;
+    }
+  }
+  return allUrls;
+}
+
+/**
+ * Builds the landing-page URL of a result.
+ *
+ * @param type result type; used as the last path segment and to pick the id parameter name
+ * @param pid when truthy, the "pid" query parameter is used regardless of type
+ * @param id identifier value, inserted into the query string as given
+ * @returns landingPrefix + type + "?" + parameter + "=" + id
+ */
+function getUrlByType(type: any, pid: any, id: any) {
+  let parameter: any = "";
+
+  if (type === "publication") {
+    parameter = "articleId";
+  } else if (type === "dataset") {
+    parameter = "datasetId";
+  } else if (type === "software") {
+    parameter = "softwareId";
+  } else if (type === "other") {
+    parameter = "orpId";
+  } else {
+    parameter = "id";
+  }
+  if(pid) {
+    parameter = "pid";
+  }
+  return landingPrefix+type+"?"+parameter+"="+id;
+}
+
+/**
+ * Starts loading the public communities from contextUrl into publicCommunities.
+ *
+ * communitiesPromise settles only after the response has been handled, so code awaiting it sees
+ * the populated list. On a request error the list stays empty and the promise still resolves.
+ */
+function getCommunities() {
+  communitiesPromise = new Promise<void>((resolve) => {
+    request.get(contextUrl, function (err: any, communitiesResponse: any) {
+      if (!communitiesResponse && err) {
+        console.error("Error getting communities ", err);
+      } else {
+        const contextsService = new ContextsService();
+        publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
+      }
+      resolve();
+    })
+  });
+}
+
+/**
+ * Entry point: creates the sitemap file named after the current date, writes the XML header and
+ * starts the crawl.
+ *
+ * @param resultsPerUrl page size forwarded to get()
+ */
+function buildSiteMap(resultsPerUrl) {
+  console.time("total_time");
+
+  let date = new Date();
+  // Date.getMonth() is zero-based, so add 1 to get the calendar month in the file names.
+  let month = date.getMonth() + 1;
+  fileName = "sitemap_"+date.getFullYear()+"_"+month+"_"+date.getDate()+".xml";//+"_"+date.getTime();
+  errorFileName = "error_"+date.getFullYear()+"_"+month+"_"+date.getDate();//+"_"+date.getTime();
+  console.log("Buiding sitemap in file: "+fileName+"\n");
+
+  let sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";
+
+  // Written synchronously so the header is in place before any <url> entry is appended.
+  try {
+    fs.writeFileSync("./"+fileName, sitemap);
+  } catch (err) {
+    return console.log("Error writing in file "+fileName+": ", err);
+  }
+
+  getCommunities();
+  get(resultsPerUrl);
+}
+
+
+var fileName;
+var errorFileName;
+const fs = require('fs');
+
+var alreadyin = 0; // duplicate urls
+var notin= 0; // unique urls
+
+var communitiesPromise;
+var publicCommunities = [];
+
+const refineUrl = "https://beta.services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
+const resultsUrlPrefix = "https://beta.services.openaire.eu/search/v2/api/resources2/?format=json";
+const landingPrefix = "https://beta.explore.openaire.eu/search/";
+const contextUrl = "https://beta.services.openaire.eu/openaire/contexts/";
+
+buildSiteMap(100);
diff --git a/services/sitempas/package.json b/services/sitempas/package.json
new file mode 100644
index 00000000..705892e0
--- /dev/null
+++ b/services/sitempas/package.json
@@ -0,0 +1,23 @@
+{
+  "name": "urls_for_sitemap",
+  "version": "1.0.0",
+  "description": "Caching in memory",
+  "main": "cache.js",
+  "scripts": {
+    "start": "PORT=3100 node extractUrlsFromSearch.js"
+  },
+  "dependencies": {
+    "cors": "^2.8.5",
+    "express": "^4.17.1",
+    "superagent": "^5.0.5"
+  },
+  "devDependencies": {
+    "typescript": "3.2.4",
+    "@types/node": "^8.0.30"
+  },
+  "engines": {
+    "node": "8.1.x"
+  },
+  "author": "Konstantina Galouni ",
+  "license": "NKUA"
+}
diff --git a/services/sitempas/run.sh b/services/sitempas/run.sh
new file mode 100755
index 00000000..44b301b1
--- /dev/null
+++ b/services/sitempas/run.sh
@@ -0,0 +1 @@
+npx ts-node extractUrlsFromSearch.ts
diff --git a/services/sitempas/tsconfig.json b/services/sitempas/tsconfig.json
new file mode 100644
index 00000000..ff2d82a7
--- /dev/null
+++ b/services/sitempas/tsconfig.json
@@ -0,0 +1,10 @@
+{
+  "compilerOptions": {
+    "typeRoots": [
+      "node_modules/@types"
+    ],
+    "noImplicitAny": false,
+    "emitDecoratorMetadata": true,
+    "experimentalDecorators": true
+  }
+}