@ -13,6 +13,62 @@ import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/ent
const request = require ( 'superagent' ) ;
const request = require ( 'superagent' ) ;
/**
 * Collects the sitemap urls of every result of `resultsType` that carries the given subject.
 *
 * The work is deferred with a zero-delay `setTimeout`, so this function returns before any
 * request is sent. The deferred task first asks the search API how many results match, then
 * requests them page by page (a new request is issued every 500 ms), hands every response to
 * `parseAllUrls`, and finally calls `finalize`, which waits for the queued page requests.
 *
 * Request failures are printed and appended to the error file; they never abort the run.
 *
 * @param resultsPerUrl number of results requested per page
 * @param resultsType   value sent as the `type` query parameter
 * @param subject       subject to filter on; a key of `subjectMapping` is replaced by its
 *                      mapped value, any other value is used verbatim
 */
function getForSubject(resultsPerUrl, resultsType, subject) {
  setTimeout(async () => {
    const allUrls = new Set();

    // Only own keys count as aliases, so inputs like "constructor" are not resolved to
    // members inherited from Object.prototype.
    let reqSubject: string = Object.prototype.hasOwnProperty.call(subjectMapping, subject)
      ? subjectMapping[subject]
      : null;
    if (reqSubject == null) {
      reqSubject = subject;
    }

    const url = resultsUrlPrefix + "&query=(" + "resultsubject exact" + " \"" + encodeURIComponent(reqSubject) + "\")" + "&type=" + resultsType;

    // Shared failure handler. fs.appendFileSync only accepts strings and buffers, so the
    // error is stringified explicitly instead of being passed as an object.
    const logFailure = (error: any) => {
      console.error("Error getting results ", error);
      fs.appendFileSync("./" + errorFileName, "no response " + url + " " + String(error) + "\n");
    };

    // Kept as the fallback page budget when the count request fails.
    let totalResults: number = 150000;
    await new Promise<void>((resolve, reject) => {
      request.get(url + "&size=0&page=0", function (err: any, response: any) {
        // superagent reports HTTP 4xx/5xx with both `err` and `response` set, so any error
        // is a failure regardless of whether a response object is present.
        if (err || !response) {
          reject(err || new Error("empty response"));
          return;
        }
        const meta = response.body && response.body['meta'];
        if (!meta || meta['total'] == null) {
          reject(new Error("response has no meta.total"));
          return;
        }
        totalResults = meta['total'];
        resolve();
      });
    }).catch(logFailure);

    const pages: number = Math.ceil(totalResults / resultsPerUrl);
    console.log("totalResults=" + totalResults + " - pages=" + pages);

    for (let page = 0; page < pages; page++) {
      // Throttle: start at most one page request every 500 ms.
      await new Promise(resolve => setTimeout(resolve, 500));
      promiseArray.push(
        new Promise<void>((resolve, reject) => {
          request.get(url + "&size=" + resultsPerUrl + "&page=" + page, function (err: any, response: any) {
            if (err || !response) {
              reject(err || new Error("empty response"));
              return;
            }
            parseAllUrls(response, allUrls, resultsType, subject);
            resolve();
          });
        }).catch(logFailure)
      );
    }

    finalize(resultsPerUrl, resultsType);
  });
}
function get ( resultsPerUrl , resultsType ) {
function get ( resultsPerUrl , resultsType ) {
setTimeout ( ( ) = > {
setTimeout ( ( ) = > {
let searchFields = new SearchFields ( ) ;
let searchFields = new SearchFields ( ) ;
@ -82,24 +138,28 @@ function get(resultsPerUrl, resultsType) {
console . log ( "" ) ;
console . log ( "" ) ;
}
}
await Promise . all ( promiseArray ) ;
finalize ( resultsPerUrl , resultsType ) ;
console . log ( "\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin ) ;
console . log ( "\nNo indexed urls: " + noIndexedUrls + " vs final urls: " + finalUrls ) ;
console . log ( "\nPublications: " + publications + " - Datasets: " + datasets +
" - Software: " + software + " - Other: " + other + " --- urls with pid: " + urlsWithPid ) ;
fs . appendFile ( "./" + fileName , "\n</urlset>" , function ( err ) {
if ( err ) {
return console . log ( "Error appending in file " + fileName + ": " , err ) ;
}
console . timeEnd ( "total_time (" + resultsPerUrl + " " + resultsType + " per request)" ) ;
} ) ;
}
}
} )
} )
} )
} )
}
}
// });
// });
/**
 * Finishes a sitemap run: waits for every promise queued in `promiseArray`, prints the
 * collected counters, appends the closing `</urlset>` tag to the current sitemap file and
 * stops the "total_time" console timer.
 *
 * @param resultsPerUrl page size of the run; only used to rebuild the timer label
 * @param resultsType   result type of the run; only used to rebuild the timer label
 */
async function finalize(resultsPerUrl, resultsType) {
  await Promise.all(promiseArray);

  const summaryLines = [
    "\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin,
    "\nNo indexed urls: " + noIndexedUrls + " vs final urls: " + finalUrls,
    "\nPublications: " + publications + " - Datasets: " + datasets +
    " - Software: " + software + " - Other: " + other + " --- urls with pid: " + urlsWithPid,
  ];
  for (const line of summaryLines) {
    console.log(line);
  }

  fs.appendFile("./" + fileName, "\n</urlset>", (err) => {
    if (!err) {
      // The label must match the one given to console.time for the timer to be reported.
      console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
      return;
    }
    console.log("Error appending in file " + fileName + ": ", err);
  });
}
function parseAllUrls_old ( response : any , allUrls : any ) {
function parseAllUrls_old ( response : any , allUrls : any ) {
// let allUrls: any = [];
// let allUrls: any = [];
@ -157,7 +217,7 @@ function parseAllUrls_old(response: any, allUrls: any) {
}
}
function parseAllUrls ( response : any , allUrls : any , resultsType : string ) {
function parseAllUrls ( response : any , allUrls : any , resultsType : string , subject : string = null ) {
// let allUrls: any = [];
// let allUrls: any = [];
let responses : any = response . body [ 'results' ] ;
let responses : any = response . body [ 'results' ] ;
@ -210,7 +270,7 @@ function parseAllUrls(response: any, allUrls: any, resultsType: string) {
console . log ( "url to be added in file: " + url ) ;
console . log ( "url to be added in file: " + url ) ;
fs . appendFileSync ( "./" + fileName , "\n</urlset>" ) ;
fs . appendFileSync ( "./" + fileName , "\n</urlset>" ) ;
console . log ( "\n" ) ;
console . log ( "\n" ) ;
createSitemapFile ( resultsType );
createSitemapFile ( resultsType , subject );
}
}
finalUrls ++ ;
finalUrls ++ ;
@ -324,21 +384,25 @@ function getCommunities() {
} ) . catch ( error = > console . error ( "Error getting communities " , error ) ) ;
} ) . catch ( error = > console . error ( "Error getting communities " , error ) ) ;
}
}
function buildSiteMap ( resultsPerUrl , resultsType ) {
function buildSiteMap ( resultsPerUrl , resultsType , subject = null ) {
console . time ( "total_time (" + resultsPerUrl + " " + resultsType + " per request)" ) ;
console . time ( "total_time (" + resultsPerUrl + " " + resultsType + " per request)" ) ;
let date = new Date ( ) ;
let date = new Date ( ) ;
createSitemapFile ( resultsType );
createSitemapFile ( resultsType , subject );
errorFileName = resultsType + "_error_" + date . getFullYear ( ) + "_" + ( date . getMonth ( ) + 1 ) + "_" + date . getDate ( ) + ".txt" ; //+"_"+date.getTime();
errorFileName = resultsType + ( subject ? "_" + subject . replace ( /\s/g , "" ) : "" ) + "_error_" + date . getFullYear ( ) + "_" + ( date . getMonth ( ) + 1 ) + "_" + date . getDate ( ) + ".txt" ; //+"_"+date.getTime();
noIndexFileName = resultsType + "_noIndex_" + date . getFullYear ( ) + "_" + ( date . getMonth ( ) + 1 ) + "_" + date . getDate ( ) + ".txt" ; //+"_"+date.getTime();
noIndexFileName = resultsType + ( subject ? "_" + subject . replace ( /\s/g , "" ) : "" ) + "_noIndex_" + date . getFullYear ( ) + "_" + ( date . getMonth ( ) + 1 ) + "_" + date . getDate ( ) + ".txt" ; //+"_"+date.getTime();
getCommunities ( ) ; // comment out for communities query check
if ( subject ) {
get ( resultsPerUrl , resultsType ) ;
getForSubject ( resultsPerUrl , resultsType , subject ) ;
} else {
getCommunities ( ) ; // comment out for communities query check
get ( resultsPerUrl , resultsType ) ;
}
}
}
async function createSitemapFile ( resultsType ) {
async function createSitemapFile ( resultsType , subject = null ) {
// let date = new Date();
// let date = new Date();
fileName = resultsType + "_sitemap" + ( filesCreated > 0 ? filesCreated : "" )
fileName = resultsType + ( subject ? "_" + subject . replace ( /\s/g , "" ) : "" ) + "_sitemap" + ( filesCreated > 0 ? filesCreated : "" )
// + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
// + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
+ ".xml" ; //+"_"+date.getTime();
+ ".xml" ; //+"_"+date.getTime();
filesCreated ++ ;
filesCreated ++ ;
@ -349,6 +413,39 @@ async function createSitemapFile(resultsType) {
fs . writeFileSync ( "./" + fileName , sitemap ) ;
fs . writeFileSync ( "./" + fileName , sitemap ) ;
}
}
/**
 * Prints `query` on stdout and reads one line of input from stdin.
 *
 * @param query text shown to the user as the prompt
 * @returns a promise resolved with the user's answer once the readline interface is closed
 */
function askQuestion(query) {
  const readline = require('readline');
  const prompt = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  return new Promise(resolve => {
    prompt.question(query, answer => {
      // Close the interface first so stdin is released before the caller continues.
      prompt.close();
      resolve(answer);
    });
  });
}
/**
 * Entry point of the script: completes the missing arguments interactively and then starts
 * the sitemap build with 200 results per request.
 *
 * @param resultsType type of results; asked on the console when null/undefined
 * @param subject     optional subject filter; asked on the console when null/undefined,
 *                    an empty answer means "no subject"
 */
async function start(resultsType, subject = null) {
  let chosenType = resultsType;
  if (chosenType == null) {
    chosenType = await askQuestion("Please provide type of results (publications, datasets, software, other): ");
  }
  console.log("type is: " + chosenType);

  let chosenSubject = subject;
  if (chosenSubject == null) {
    const subjectPrompt = "Please provide subject. " +
      "Available subjects are \"Physics::Atomic Physics\" or physics, \"Mathematics::Combinatorics\" or mathematics, " +
      "any other subject you want or no value if no subject: ";
    const answer = await askQuestion(subjectPrompt);
    // An empty answer is normalised to null so later checks treat it as "no subject".
    chosenSubject = answer ? answer : null;
  }
  console.log("subject is: " + chosenSubject);

  buildSiteMap(200, chosenType, chosenSubject);
}
let filesCreated = 0 ;
let filesCreated = 0 ;
let fileName ;
let fileName ;
let errorFileName ;
let errorFileName ;
@ -379,5 +476,7 @@ const landingPrefix = "https://explore.openaire.eu/search/";
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=" ;
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=" ;
const contextUrl = "https://services.openaire.eu/openaire/contexts/" ;
const contextUrl = "https://services.openaire.eu/openaire/contexts/" ;
// process.argv[3] is the "resultType" argument
const subjectMapping = { "physics" : "Physics::Atomic Physics" , "mathematics" : "Mathematics::Combinatorics" }
buildSiteMap ( 200 , process . argv [ 3 ] ) ;
// process.argv[3] is the "resultType" argument, process.argv[4] is the "subject" argument
start ( process . argv [ 3 ] , process . argv [ 4 ] ) ;