[Explore]: services/sitemaps: In sitemaps creation, added case for querying with subjects (instead of refine).
1. extractUrlsFromSearch.ts: a. Added prompts that read the result type and the subject when they are not passed as arguments (the subject may be left empty). b. Added creation of sitemaps for the results related to a specific subject (several functions are parameterized accordingly). 2. run.sh: Added a second parameter for the subject. 3. .gitignore: Updated so that sitemap files created locally are not committed.
This commit is contained in:
parent
c268a3ae2c
commit
2209804919
|
@ -4,3 +4,7 @@
|
||||||
**/.idea
|
**/.idea
|
||||||
**/dist
|
**/dist
|
||||||
**/uploads
|
**/uploads
|
||||||
|
services/sitemaps/*.xml
|
||||||
|
services/sitemaps/*.txt
|
||||||
|
services/sitemaps/sitemaps
|
||||||
|
services/sitemaps/sitemaps.zip
|
|
@ -13,6 +13,62 @@ import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/ent
|
||||||
|
|
||||||
const request = require('superagent');
|
const request = require('superagent');
|
||||||
|
|
||||||
|
/**
 * Queries the search API for every `resultsType` result that has the given subject
 * and hands each response page to `parseAllUrls`, then calls `finalize`.
 *
 * The work is deferred with `setTimeout` and runs asynchronously; this function
 * itself returns immediately.
 *
 * @param resultsPerUrl number of results requested per page (sent as `size`)
 * @param resultsType   value sent as the `type` query parameter
 * @param subject       subject to query; a key of `subjectMapping` is replaced by its
 *                      mapped value, anything else is used verbatim
 */
function getForSubject(resultsPerUrl: number, resultsType: string, subject: string) {
  const run = async (): Promise<void> => {
    const allUrls = new Set();

    // Only own keys count as aliases: a plain `subjectMapping[subject]` lookup would
    // also resolve inherited members such as "constructor" or "toString".
    const aliases = subjectMapping as Record<string, string>;
    const reqSubject: string = Object.prototype.hasOwnProperty.call(aliases, subject)
      ? aliases[subject]
      : subject;

    const url = resultsUrlPrefix + "&query=(" + "resultsubject exact" + " \"" + encodeURIComponent(reqSubject) + "\")" + "&type=" + resultsType;

    // Records a failed request on stderr and in the error file.
    const logRequestError = (error: unknown): void => {
      console.error("Error getting results ", error);
      try {
        // appendFileSync only accepts strings/buffers, so the error is stringified explicitly.
        fs.appendFileSync("./" + errorFileName, "no response " + url + " " + String(error) + "\n");
      } catch (writeError) {
        console.error("Error appending in file " + errorFileName + ": ", writeError);
      }
    };

    // Performs one GET request and passes the response to `handle`.
    // The returned promise never rejects: failures are logged instead.
    const fetchAndHandle = (requestUrl: string, handle: (response: any) => void): Promise<void> =>
      new Promise<void>((resolve, reject) => {
        request.get(requestUrl, function (err: any, response: any) {
          if (!response && err) {
            reject(err);
            return;
          }
          // The callback runs outside the promise executor, so a throw here would be an
          // uncaught exception; convert it into a rejection instead.
          try {
            handle(response);
            resolve();
          } catch (handlingError) {
            reject(handlingError);
          }
        });
      }).catch(logRequestError);

    // Fallback used when the total cannot be read from the first response.
    let totalResults = 150000;
    await fetchAndHandle(url + "&size=0&page=0", response => {
      const total = Number(response.body['meta']['total']);
      if (!Number.isFinite(total) || total < 0) {
        throw new Error("Unexpected total in response: " + response.body['meta']['total']);
      }
      totalResults = total;
    });

    const pages: number = Math.ceil(totalResults / resultsPerUrl);
    console.log("totalResults=" + totalResults + " - pages=" + pages);
    for (let page = 0; page < pages; page++) {
      // Space the requests out: one new request every 500ms.
      await new Promise(resolve => setTimeout(resolve, 500));

      promiseArray.push(
        fetchAndHandle(url + "&size=" + resultsPerUrl + "&page=" + page, response => {
          parseAllUrls(response, allUrls, resultsType, subject);
        })
      );
    }

    await finalize(resultsPerUrl, resultsType);
  };

  setTimeout(() => {
    // Nothing awaits the timer callback, so failures are reported here.
    run().catch(error => console.error("Error creating sitemap for subject " + subject + ": ", error));
  });
}
|
||||||
|
|
||||||
function get(resultsPerUrl, resultsType) {
|
function get(resultsPerUrl, resultsType) {
|
||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
let searchFields = new SearchFields();
|
let searchFields = new SearchFields();
|
||||||
|
@ -82,24 +138,28 @@ function get(resultsPerUrl, resultsType) {
|
||||||
console.log("");
|
console.log("");
|
||||||
}
|
}
|
||||||
|
|
||||||
await Promise.all(promiseArray);
|
finalize(resultsPerUrl, resultsType);
|
||||||
console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: "+notin);
|
|
||||||
console.log("\nNo indexed urls: "+noIndexedUrls + " vs final urls: "+finalUrls);
|
|
||||||
console.log("\nPublications: "+publications + " - Datasets: "+datasets +
|
|
||||||
" - Software: "+software + " - Other: "+other + " --- urls with pid: "+urlsWithPid);
|
|
||||||
|
|
||||||
fs.appendFile("./" + fileName, "\n</urlset>", function (err) {
|
|
||||||
if (err) {
|
|
||||||
return console.log("Error appending in file "+fileName+": ", err);
|
|
||||||
}
|
|
||||||
console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
// });
|
// });
|
||||||
|
|
||||||
|
/**
 * Waits for every promise in `promiseArray`, prints the collected counters,
 * appends the closing `</urlset>` tag to the current sitemap file and, if that
 * succeeds, ends the "total_time" timer.
 *
 * @param resultsPerUrl value used in the timer label
 * @param resultsType   value used in the timer label
 */
async function finalize(resultsPerUrl, resultsType) {
  await Promise.all(promiseArray);

  const summaryLines = [
    "\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin,
    "\nNo indexed urls: " + noIndexedUrls + " vs final urls: " + finalUrls,
    "\nPublications: " + publications + " - Datasets: " + datasets +
      " - Software: " + software + " - Other: " + other + " --- urls with pid: " + urlsWithPid,
  ];
  for (const line of summaryLines) {
    console.log(line);
  }

  const timerLabel = "total_time (" + resultsPerUrl + " " + resultsType + " per request)";
  fs.appendFile("./" + fileName, "\n</urlset>", (appendError) => {
    if (!appendError) {
      console.timeEnd(timerLabel);
      return;
    }
    console.log("Error appending in file " + fileName + ": ", appendError);
  });
}
|
||||||
|
|
||||||
function parseAllUrls_old(response: any, allUrls: any) {
|
function parseAllUrls_old(response: any, allUrls: any) {
|
||||||
// let allUrls: any = [];
|
// let allUrls: any = [];
|
||||||
|
|
||||||
|
@ -157,7 +217,7 @@ function parseAllUrls_old(response: any, allUrls: any) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function parseAllUrls(response: any, allUrls: any, resultsType: string) {
|
function parseAllUrls(response: any, allUrls: any, resultsType: string, subject: string = null) {
|
||||||
// let allUrls: any = [];
|
// let allUrls: any = [];
|
||||||
|
|
||||||
let responses: any = response.body['results'];
|
let responses: any = response.body['results'];
|
||||||
|
@ -210,7 +270,7 @@ function parseAllUrls(response: any, allUrls: any, resultsType: string) {
|
||||||
console.log("url to be added in file: "+url);
|
console.log("url to be added in file: "+url);
|
||||||
fs.appendFileSync("./" + fileName, "\n</urlset>");
|
fs.appendFileSync("./" + fileName, "\n</urlset>");
|
||||||
console.log("\n");
|
console.log("\n");
|
||||||
createSitemapFile(resultsType);
|
createSitemapFile(resultsType, subject);
|
||||||
}
|
}
|
||||||
|
|
||||||
finalUrls++;
|
finalUrls++;
|
||||||
|
@ -324,21 +384,25 @@ function getCommunities() {
|
||||||
}).catch(error => console.error("Error getting communities ", error));
|
}).catch(error => console.error("Error getting communities ", error));
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildSiteMap(resultsPerUrl, resultsType) {
|
function buildSiteMap(resultsPerUrl, resultsType, subject = null) {
|
||||||
console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
|
||||||
|
|
||||||
let date = new Date();
|
let date = new Date();
|
||||||
createSitemapFile(resultsType);
|
createSitemapFile(resultsType, subject);
|
||||||
errorFileName = resultsType + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
errorFileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_error_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||||
noIndexFileName = resultsType + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
noIndexFileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_noIndex_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();
|
||||||
|
|
||||||
|
if(subject) {
|
||||||
|
getForSubject(resultsPerUrl, resultsType, subject);
|
||||||
|
} else {
|
||||||
getCommunities(); // comment out for communities query check
|
getCommunities(); // comment out for communities query check
|
||||||
get(resultsPerUrl, resultsType);
|
get(resultsPerUrl, resultsType);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function createSitemapFile(resultsType) {
|
async function createSitemapFile(resultsType, subject=null) {
|
||||||
// let date = new Date();
|
// let date = new Date();
|
||||||
fileName = resultsType + "_sitemap" + (filesCreated > 0 ? filesCreated : "")
|
fileName = resultsType + (subject ? "_"+subject.replace(/\s/g, "") : "") + "_sitemap" + (filesCreated > 0 ? filesCreated : "")
|
||||||
// + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
|
// + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
|
||||||
+ ".xml";//+"_"+date.getTime();
|
+ ".xml";//+"_"+date.getTime();
|
||||||
filesCreated++;
|
filesCreated++;
|
||||||
|
@ -349,6 +413,39 @@ async function createSitemapFile(resultsType) {
|
||||||
fs.writeFileSync("./" + fileName, sitemap);
|
fs.writeFileSync("./" + fileName, sitemap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Prints `query` on stdout and resolves with the line the user types on stdin.
 * The readline interface is closed as soon as the answer is received.
 */
function askQuestion(query) {
  const { createInterface } = require('readline');
  const prompt = createInterface({ input: process.stdin, output: process.stdout });

  return new Promise(resolve => {
    prompt.question(query, answer => {
      prompt.close();
      resolve(answer);
    });
  });
}
|
||||||
|
|
||||||
|
/**
 * Entry point: resolves the result type and the subject (asking on the console for
 * whichever was not supplied) and starts the sitemap creation with 200 results per request.
 *
 * @param resultsType type of results; asked for interactively when null/undefined
 * @param subject     subject to restrict the results to; asked for interactively when
 *                    null/undefined. An empty or blank subject means "no subject".
 */
async function start(resultsType?: string | null, subject: string | null = null) {
  if (resultsType == null) {
    resultsType = String(await askQuestion("Please provide type of results (publications, datasets, software, other): "));
  }
  // Stray whitespace would otherwise end up in file names and in the query.
  resultsType = resultsType.trim();
  if (!resultsType) {
    // Without a type the file names and the search query would be malformed.
    console.error("No type of results was provided - nothing to do.");
    process.exitCode = 1;
    return;
  }
  console.log("type is: " + resultsType);

  if (subject == null) {
    subject = String(await askQuestion("Please provide subject. " +
      "Available subjects are \"Physics::Atomic Physics\" or physics, \"Mathematics::Combinatorics\" or mathematics, " +
      "any other subject you want or no value if no subject: "));
  }
  // Normalise both prompted and argument-supplied values: blank means no subject.
  subject = subject.trim() || null;
  console.log("subject is: " + subject);

  buildSiteMap(200, resultsType, subject);
}
|
||||||
|
|
||||||
let filesCreated = 0;
|
let filesCreated = 0;
|
||||||
let fileName;
|
let fileName;
|
||||||
let errorFileName;
|
let errorFileName;
|
||||||
|
@ -379,5 +476,7 @@ const landingPrefix = "https://explore.openaire.eu/search/";
|
||||||
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
|
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
|
||||||
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
|
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
|
||||||
|
|
||||||
// process.argc[3] is the "resultType" argument
|
const subjectMapping = {"physics": "Physics::Atomic Physics", "mathematics": "Mathematics::Combinatorics"}
|
||||||
buildSiteMap(200, process.argv[3]);
|
|
||||||
|
// process.argv[3] is the "resultType" argument, process.argv[4] is the "subject" argument
|
||||||
|
start(process.argv[3], process.argv[4]);
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
npx ts-node extractUrlsFromSearch.ts -- $1
|
# Forward all arguments (result type, optional subject) unchanged. "$@" keeps a subject
# that contains spaces, e.g. "Physics::Atomic Physics", as a single argument.
npx ts-node extractUrlsFromSearch.ts -- "$@"
|
||||||
|
|
Loading…
Reference in New Issue