[Explore Service | Services]: 1. extractUrlsFromSearch.ts: a. Query for the parametrized "resultType" instead of the hard-coded "results" type.

b. When the number of final URLs exceeds 50000, a new sitemap file is created every 50000 URLs.
2. run.sh: Added $1 to pass the "resultType" argument.
This commit is contained in:
Konstantina Galouni 2021-08-10 12:57:25 +03:00
parent f50db71c44
commit 6350a9f550
2 changed files with 40 additions and 28 deletions

View File

@ -13,11 +13,13 @@ import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/ent
const request = require('superagent'); const request = require('superagent');
function get(resultsPerUrl) { function get(resultsPerUrl, resultsType) {
setTimeout(() => { setTimeout(() => {
let searchFields = new SearchFields(); let searchFields = new SearchFields();
let fieldIdsMap = searchFields.RESULT_FIELDS; let fieldIdsMap = searchFields.RESULT_FIELDS;
refineUrl += resultsType;
request.get(refineUrl, async function (err: any, refineResponse: any) { request.get(refineUrl, async function (err: any, refineResponse: any) {
if (!refineResponse && err) { if (!refineResponse && err) {
console.error("Error getting refine filters ",err); console.error("Error getting refine filters ",err);
@ -27,8 +29,6 @@ function get(resultsPerUrl) {
let allUrls = new Set(); let allUrls = new Set();
let promiseArray = [];
for (let key of keys) { for (let key of keys) {
// comment out for communities query check // comment out for communities query check
if(key == "community") { if(key == "community") {
@ -59,7 +59,7 @@ function get(resultsPerUrl) {
} }
} }
const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl; const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=" + resultsType + "&page=0&size=" + resultsPerUrl;
await new Promise(resolve => setTimeout(resolve, 500)); await new Promise(resolve => setTimeout(resolve, 500));
promiseArray.push( promiseArray.push(
@ -68,7 +68,7 @@ function get(resultsPerUrl) {
if (!response && err) { if (!response && err) {
reject(err); reject(err);
} else { } else {
parseAllUrls(response, allUrls); parseAllUrls(response, allUrls, resultsType);
resolve(value => value); resolve(value => value);
} }
}) })
@ -92,7 +92,7 @@ function get(resultsPerUrl) {
if (err) { if (err) {
return console.log("Error appending in file "+fileName+": ", err); return console.log("Error appending in file "+fileName+": ", err);
} }
console.timeEnd("total_time"); console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
}); });
} }
}) })
@ -157,7 +157,7 @@ function parseAllUrls_old(response: any, allUrls: any) {
} }
function parseAllUrls(response: any, allUrls: any) { function parseAllUrls(response: any, allUrls: any, resultsType: string) {
// let allUrls: any = []; // let allUrls: any = [];
let responses: any = response.body['results']; let responses: any = response.body['results'];
@ -206,6 +206,13 @@ function parseAllUrls(response: any, allUrls: any) {
noIndexedUrls++; noIndexedUrls++;
fs.appendFileSync("./"+noIndexFileName, url+"\n"); fs.appendFileSync("./"+noIndexFileName, url+"\n");
} else { } else {
if(finalUrls > 0 && ((finalUrls % 50000) == 0)) {
console.log("url to be added in file: "+url);
fs.appendFileSync("./" + fileName, "\n</urlset>");
console.log("\n");
createSitemapFile(resultsType);
}
finalUrls++; finalUrls++;
let urlPre = "\n<url><loc>"; let urlPre = "\n<url><loc>";
let urlSuf = "</loc></url>"; let urlSuf = "</loc></url>";
@ -317,43 +324,47 @@ function getCommunities() {
}).catch(error => console.error("Error getting communities ", error)); }).catch(error => console.error("Error getting communities ", error));
} }
function buildSiteMap(resultsPerUrl) { function buildSiteMap(resultsPerUrl, resultsType) {
console.time("total_time"); console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
let date = new Date(); let date = new Date();
fileName = "sitemap_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".xml";//+"_"+date.getTime(); createSitemapFile(resultsType);
errorFileName = "error_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime(); errorFileName = resultsType + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
noIndexFileName = "noIndex_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime(); noIndexFileName = resultsType + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
console.log("Buiding sitemap in file: "+fileName+"\n");
let sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
"<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";
fs.writeFile("./"+fileName, sitemap, function(err) {
if(err) {
return console.log("Error writing in file "+fileName+": ", err);
}
});
getCommunities(); // comment out for communities query check getCommunities(); // comment out for communities query check
get(resultsPerUrl); get(resultsPerUrl, resultsType);
} }
/**
 * Starts the next sitemap file for the given result type.
 *
 * Sets the module-level `fileName` to `<resultsType>_sitemap<N>.xml`, where the
 * numeric suffix is left out while `filesCreated` is 0 and is the current value
 * of `filesCreated` otherwise. It then increments `filesCreated`, logs the
 * chosen name, and writes the XML declaration plus the opening `<urlset>` tag
 * to that file.
 *
 * The write uses `fs.writeFileSync` and the body contains no `await`, so the
 * file has been written by the time the call returns, even though the function
 * is declared `async` and hands back a promise.
 *
 * @param resultsType result type used as the file name prefix
 */
async function createSitemapFile(resultsType) {
    // The file name deliberately carries no date or timestamp component.
    const suffix = filesCreated > 0 ? filesCreated : "";
    fileName = `${resultsType}_sitemap${suffix}.xml`;
    filesCreated += 1;

    console.log(`Buiding sitemap in file: ${fileName}\n`);

    // Header only; the closing </urlset> tag is appended by the code that finishes the file.
    const header = [
        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
        "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">",
    ].join("\n");
    fs.writeFileSync(`./${fileName}`, header);
}
let filesCreated = 0;
let fileName; let fileName;
let errorFileName; let errorFileName;
let noIndexFileName; let noIndexFileName;
const fs = require('fs'); const fs = require('fs');
let promiseArray = [];
let alreadyin = 0; // duplicate urls let alreadyin = 0; // duplicate urls
let notin= 0; let notin= 0;
let finalUrls = 0; let finalUrls = 0;
let noIndexedUrls = 0; let noIndexedUrls = 0;
let urlsWithPid = 0;
let publications = 0; let publications = 0;
let datasets = 0; let datasets = 0;
let software = 0; let software = 0;
let other = 0; let other = 0;
let urlsWithPid = 0;
// comment out for communities query check // comment out for communities query check
let communitiesPromise; let communitiesPromise;
@ -362,10 +373,11 @@ let publicCommunities = [];
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json"; const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
const landingPrefix = "https://explore.openaire.eu/search/"; const landingPrefix = "https://explore.openaire.eu/search/";
// const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0"; // let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
// comment out for communities query check // comment out for communities query check
const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0"; let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
const contextUrl = "https://services.openaire.eu/openaire/contexts/"; const contextUrl = "https://services.openaire.eu/openaire/contexts/";
buildSiteMap(150); // process.argv[3] is the "resultType" argument
buildSiteMap(200, process.argv[3]);

View File

@ -1 +1 @@
npx ts-node extractUrlsFromSearch.ts npx ts-node extractUrlsFromSearch.ts -- $1