[Explore Service | Services]: 1. extractUrlsFromSearch.ts: a. Query for the parametrized "resultType" instead of the hard-coded "results" type.
b. When the number of final urls exceeds 50000, start a new sitemap file every 50000 urls. 2. run.sh: Added $1 to pass the "resultType" argument.
This commit is contained in:
parent
f50db71c44
commit
6350a9f550
|
@ -13,11 +13,13 @@ import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/ent
|
|||
|
||||
const request = require('superagent');
|
||||
|
||||
function get(resultsPerUrl) {
|
||||
function get(resultsPerUrl, resultsType) {
|
||||
setTimeout(() => {
|
||||
let searchFields = new SearchFields();
|
||||
let fieldIdsMap = searchFields.RESULT_FIELDS;
|
||||
|
||||
refineUrl += resultsType;
|
||||
|
||||
request.get(refineUrl, async function (err: any, refineResponse: any) {
|
||||
if (!refineResponse && err) {
|
||||
console.error("Error getting refine filters ",err);
|
||||
|
@ -27,8 +29,6 @@ function get(resultsPerUrl) {
|
|||
|
||||
let allUrls = new Set();
|
||||
|
||||
let promiseArray = [];
|
||||
|
||||
for (let key of keys) {
|
||||
// comment out for communities query check
|
||||
if(key == "community") {
|
||||
|
@ -59,7 +59,7 @@ function get(resultsPerUrl) {
|
|||
}
|
||||
}
|
||||
|
||||
const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;
|
||||
const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=" + resultsType + "&page=0&size=" + resultsPerUrl;
|
||||
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
promiseArray.push(
|
||||
|
@ -68,7 +68,7 @@ function get(resultsPerUrl) {
|
|||
if (!response && err) {
|
||||
reject(err);
|
||||
} else {
|
||||
parseAllUrls(response, allUrls);
|
||||
parseAllUrls(response, allUrls, resultsType);
|
||||
resolve(value => value);
|
||||
}
|
||||
})
|
||||
|
@ -92,7 +92,7 @@ function get(resultsPerUrl) {
|
|||
if (err) {
|
||||
return console.log("Error appending in file "+fileName+": ", err);
|
||||
}
|
||||
console.timeEnd("total_time");
|
||||
console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
||||
});
|
||||
}
|
||||
})
|
||||
|
@ -157,7 +157,7 @@ function parseAllUrls_old(response: any, allUrls: any) {
|
|||
}
|
||||
|
||||
|
||||
function parseAllUrls(response: any, allUrls: any) {
|
||||
function parseAllUrls(response: any, allUrls: any, resultsType: string) {
|
||||
// let allUrls: any = [];
|
||||
|
||||
let responses: any = response.body['results'];
|
||||
|
@ -206,6 +206,13 @@ function parseAllUrls(response: any, allUrls: any) {
|
|||
noIndexedUrls++;
|
||||
fs.appendFileSync("./"+noIndexFileName, url+"\n");
|
||||
} else {
|
||||
if(finalUrls > 0 && ((finalUrls % 50000) == 0)) {
|
||||
console.log("url to be added in file: "+url);
|
||||
fs.appendFileSync("./" + fileName, "\n</urlset>");
|
||||
console.log("\n");
|
||||
createSitemapFile(resultsType);
|
||||
}
|
||||
|
||||
finalUrls++;
|
||||
let urlPre = "\n<url><loc>";
|
||||
let urlSuf = "</loc></url>";
|
||||
|
@ -317,43 +324,47 @@ function getCommunities() {
|
|||
}).catch(error => console.error("Error getting communities ", error));
|
||||
}
|
||||
|
||||
function buildSiteMap(resultsPerUrl) {
|
||||
console.time("total_time");
|
||||
function buildSiteMap(resultsPerUrl, resultsType) {
|
||||
console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
||||
|
||||
let date = new Date();
|
||||
fileName = "sitemap_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".xml";//+"_"+date.getTime();
|
||||
errorFileName = "error_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime();
|
||||
noIndexFileName = "noIndex_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime();
|
||||
console.log("Buiding sitemap in file: "+fileName+"\n");
|
||||
|
||||
let sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
|
||||
"<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";
|
||||
|
||||
fs.writeFile("./"+fileName, sitemap, function(err) {
|
||||
if(err) {
|
||||
return console.log("Error writing in file "+fileName+": ", err);
|
||||
}
|
||||
});
|
||||
createSitemapFile(resultsType);
|
||||
errorFileName = resultsType + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||
noIndexFileName = resultsType + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||
|
||||
getCommunities(); // comment out for communities query check
|
||||
get(resultsPerUrl);
|
||||
get(resultsPerUrl, resultsType);
|
||||
}
|
||||
|
||||
// Starts a new sitemap file and makes it the current output target.
//
// Sets the module-level `fileName` to "<resultsType>_sitemap<N>.xml", where the
// numeric suffix <N> is omitted for the first file (filesCreated == 0) and is
// the current value of `filesCreated` afterwards, then increments
// `filesCreated`. The file is (over)written synchronously with the XML
// declaration and the opening <urlset> tag; the closing tag is not written here.
//
// Parameter resultsType: string used as the file-name prefix.
//
// NOTE(review): the function is declared `async` but contains no `await`, so
// all of its work is done by the time the call returns; the returned promise
// carries no value.
async function createSitemapFile(resultsType) {
|
||||
// let date = new Date();
|
||||
// First file has no numeric suffix; later files get the running counter.
fileName = resultsType + "_sitemap" + (filesCreated > 0 ? filesCreated : "")
|
||||
// + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
|
||||
+ ".xml";//+"_"+date.getTime();
|
||||
filesCreated++;
|
||||
|
||||
console.log("Buiding sitemap in file: " + fileName + "\n");
|
||||
|
||||
// XML prolog plus the opening <urlset> element of the sitemap.
let sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";
|
||||
// Synchronous write: the file exists with its header before this function returns.
fs.writeFileSync("./" + fileName, sitemap);
|
||||
}
|
||||
|
||||
let filesCreated = 0;
|
||||
let fileName;
|
||||
let errorFileName;
|
||||
let noIndexFileName;
|
||||
const fs = require('fs');
|
||||
|
||||
let promiseArray = [];
|
||||
let alreadyin = 0; // duplicate urls
|
||||
let notin= 0;
|
||||
let finalUrls = 0;
|
||||
let noIndexedUrls = 0;
|
||||
let urlsWithPid = 0;
|
||||
let publications = 0;
|
||||
let datasets = 0;
|
||||
let software = 0;
|
||||
let other = 0;
|
||||
let urlsWithPid = 0;
|
||||
|
||||
// comment out for communities query check
|
||||
let communitiesPromise;
|
||||
|
@ -362,10 +373,11 @@ let publicCommunities = [];
|
|||
|
||||
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
|
||||
const landingPrefix = "https://explore.openaire.eu/search/";
|
||||
// const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
|
||||
// let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
|
||||
|
||||
// comment out for communities query check
|
||||
const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
|
||||
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
|
||||
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
|
||||
|
||||
buildSiteMap(150);
|
||||
// process.argv[3] is the "resultType" argument
|
||||
buildSiteMap(200, process.argv[3]);
|
||||
|
|
|
@ -1 +1 @@
|
|||
npx ts-node extractUrlsFromSearch.ts
|
||||
npx ts-node extractUrlsFromSearch.ts -- $1
|
||||
|
|
Loading…
Reference in New Issue