Regex support
parent
18e8b009a0
commit
c13b86c031
@ -0,0 +1,328 @@
|
||||
# coding: utf-8
|
||||
import setpath
|
||||
import re
|
||||
from lib import porter2 as porter
|
||||
import functions
|
||||
import unicodedata
|
||||
import itertools
|
||||
|
||||
# Increase the regular expression cache size: the functions below compile
# caller-supplied patterns at call time, and the default cache is too small
# to keep them all resident.
try:
    re._MAXCACHE = 1000
except AttributeError:
    # _MAXCACHE is a private attribute and may be absent in some
    # re implementations; silently keep the default cache size then.
    pass

# Every regular expression containing \W \w \D \d \b \S \s needs to be compiled
# with re.UNICODE as below.  To embed the UNICODE directive inside the
# regular expression itself use (?u), e.g. re.sub(r'(?u)[\W\d]', ' ', o).
# NOTE: the former ur'...' literals were Python-2-only syntax; plain r'...'
# combined with the explicit re.UNICODE flag is equivalent.
delete_numbers_and_non_letters = re.compile(r'[\W]', re.UNICODE)
delete_non_letters = re.compile(r'[\W]', re.UNICODE)
delete_word_all = re.compile(r'\w+\sall', re.UNICODE)
delete_word_all_and_or = re.compile(r'\w+\sall\s(?:and|or)', re.UNICODE)
text_tokens = re.compile(r'([\d.]+\b|\w+|\$[\d.]+)', re.UNICODE)
# Strips leading/trailing whitespace and removes newlines that follow a
# non-terminating character (i.e. soft line breaks inside a sentence).
strip_remove_newlines = re.compile(u'(?:\\s+$|^\\s+|(?<=[^\\s\\d\\w.;,!?])\n+)', re.UNICODE)
reduce_spaces = re.compile(r'\s+', re.UNICODE)
# Field names recognised by the CQL-related functions elsewhere in the project.
cqlterms = ('title', 'subject', 'person', 'enter', 'creator', 'isbn')
replchars = re.compile(r'[\n\r]')
|
||||
|
||||
|
||||
def textacknowledgments(txt,span = 10,maxlen = 3,pattern = r'(?:support)|(?:thank)|(?:in part)|(?:research)|(?:\bwork\b)|(?:\bgrants?\b)|(?:project)|(?:science)|(?:fund)|(?:nation)|(?:author)|(?:foundation)|(?:\bprogram\b)|(?:\bhelp\b)|(?:paper)|(?:technolog)|(?:partial)|(?:acknowledg)|(?:provid)|(?:grate)|(?:\bcenter\b)|(?:study)|(?:discuss)|(?:particip)|(?:ministry)|(?:contribut)|(?:european)|(?:number)|(?:valuabl)|(?:education)|(?:council)|(?:award)|(?:contract)|(?:institut)' ):
    """
    .. function:: textacknowledgments(text, span = 10, maxlen = 3, pattern = ...)

    Returns the "Acknowledgments" section of a document.  To find it, it
    searches for the part of the document that has the highest density of
    *pattern* matches.

    .. parameters:: txt,span,maxlen,pattern
       txt: input text.
       span: number of words per chunk that txt is split into.
       maxlen: size, in chunks, of the scrolling window over the text in
           which the match density is calculated.
       pattern: regular expression matched against each (lowercased) chunk.
           The default matches words that commonly occur in acknowledgment
           sections (support, thank, grant, fund, ...).

    Examples:

    >>> sql("select textacknowledgments('')")
    textacknowledgments('')
    ------------------
    <BLANKLINE>
    """

    # Normalize line endings, drop soft line breaks, collapse whitespace runs.
    exp = re.sub('\r\n', '\n', txt)
    exp = reduce_spaces.sub(' ', strip_remove_newlines.sub('', exp))

    # Density scanning needs enough words to be meaningful; return short
    # texts unchanged.
    if exp.count(' ') < span * 10:
        return exp

    origwords = exp.split(' ')
    words = exp.lower().split(' ')

    # Split the original and the lowercased text into span-sized chunks.
    # Both lists have the same length (same word count).
    spanedorigtext = [' '.join(origwords[i:i + span]) for i in range(0, len(origwords), span)]
    spanedlowtext = [' '.join(words[i:i + span]) for i in range(0, len(words), span)]

    # Floor division: the former maxlen/2 would yield a float under Python 3.
    half = maxlen // 2

    # Per-chunk pattern match counts, zero-padded on both sides so a centered
    # window of size maxlen fits even at the edges.
    results = [0] * half
    for chunk in spanedlowtext:
        results.append(sum(1 for _ in re.finditer(pattern, chunk)))
    results.extend([0] * half)

    # Smoothed density: mean match count over a centered maxlen-chunk window.
    densities = []
    for i in range(half, len(results) - half):
        densities.append(sum(results[i - half:i - half + maxlen]) * 1.0 / maxlen)

    threshold = 1

    # Group consecutive above-threshold chunks into "paragraphs" and track
    # each paragraph's total raw match count.
    paragraphsum = []
    paragraphs = []
    prev = -10  # sentinel so the first hit always opens a new paragraph
    for current, line in enumerate(spanedorigtext):
        if densities[current] > threshold:
            if (prev + 1) != current:
                # first visit of a new paragraph
                paragraphsum.append(0)
                paragraphs.append([])
            paragraphsum[-1] += results[half + current]
            paragraphs[-1].append(line)
            prev = current

    # Sentinel entry so that index 0 is valid even when nothing matched.
    paragraphsum.append(0)
    paragraphs.append([])

    # Pick the paragraph with the largest total match count.
    maxsum = 0
    maxi = 0
    for cnt, total in enumerate(paragraphsum):
        if total > maxsum:
            maxsum = total
            maxi = cnt

    # Require a minimal amount of evidence before returning a section.
    if paragraphsum[maxi] > 2:
        return '\n'.join(paragraphs[maxi])
    else:
        return ''

textacknowledgments.registered=True
|
||||
|
||||
|
||||
def textacknowledgmentsstem(txt,span = 10,maxlen = 3,pattern = r'(?:support)|(?:thank)|(?:research)|(?:\bwork\b)|(?:\bgrant\b)|(?:project)|(?:scienc)|(?:\bfund\b)|(?:nation)|(?:author)|(?:foundat)|(?:\bprogram\b)|(?:\bhelp\b)|(?:univers)|(?:paper)|(?:technolog)|(?:partial)|(?:comment)|(?:develop)|(?:acknowledg)|(?:review)|(?:provid)|(?:grate)|(?:\bcenter\b)|(?:studi)|(?:discuss)|(?:particip)|(?:ministri)|(?:contribut)|(?:european)|(?:system)|(?:comput)|(?:number)|(?:valuabl)|(?:educ)|(?:council)|(?:award)|(?:contract)|(?:inform)|(?:institut)' ):
    """
    .. function:: textacknowledgmentsstem(text, span = 10, maxlen = 3, pattern = ...)

    Returns the "Acknowledgments" section of a document.  Like
    :func:`textacknowledgments`, but the pattern is matched against
    Porter-stemmed words, so the default pattern uses stemmed forms
    (e.g. "scienc", "studi").

    .. parameters:: txt,span,maxlen,pattern
       txt: input text.
       span: number of words per chunk that txt is split into.
       maxlen: size, in chunks, of the scrolling window over the text in
           which the match density is calculated.
       pattern: regular expression matched against each stemmed chunk.

    Examples:

    >>> sql("select textacknowledgmentsstem('')")
    textacknowledgmentsstem('')
    ------------------
    <BLANKLINE>
    """

    # Normalize line endings, drop soft line breaks, collapse whitespace runs.
    exp = re.sub('\r\n', '\n', txt)
    exp = reduce_spaces.sub(' ', strip_remove_newlines.sub('', exp))

    # Return short texts unchanged.
    if exp.count(' ') < span * 10:
        return exp

    acknowledgments = []
    origwords = exp.split(' ')
    words = exp.lower().split(' ')

    # Stem every non-empty word; fall back to the raw word if the stemmer
    # raises (e.g. on unexpected input).
    stemed = []
    for k in words:
        if len(k) > 0:
            try:
                stemed.append(porter.stem(k))
            except Exception:
                stemed.append(k)

    spanedorigtext = [' '.join(origwords[i:i + span]) for i in range(0, len(origwords), span)]
    spanedstemtext = [' '.join(stemed[i:i + span]) for i in range(0, len(stemed), span)]

    # Floor division: the former maxlen/2 would yield a float under Python 3.
    half = maxlen // 2

    # Per-chunk match counts, zero-padded so the centered window fits.
    results = [0] * half
    for chunk in spanedstemtext:
        results.append(sum(1 for _ in re.finditer(pattern, chunk)))
    results.extend([0] * half)

    # Smoothed density: mean match count over a centered maxlen-chunk window.
    densities = []
    for i in range(half, len(results) - half):
        densities.append(sum(results[i - half:i - half + maxlen]) * 1.0 / maxlen)

    threshold = 1

    # Empty words are skipped while stemming, so stemed (and therefore
    # densities) can be shorter than spanedorigtext; the bounds check guards
    # against the IndexError the original code could raise in that case.
    for current, chunk in enumerate(spanedorigtext):
        if len(chunk) > 10 and current < len(densities) and densities[current] > threshold:
            acknowledgments.append(chunk)
    return '\n'.join(acknowledgments)

textacknowledgmentsstem.registered=True
|
||||
|
||||
# without tara: pattern=r'(?:\bthank)|(?:\barticl)|(?:\bpmc\b)|(?:\bsupport)|(?:\bsampl)|(?:\bexpedit)|(?:\bfoundat)|(?:\bresearch)|(?:\bhelp)|(?:\binstitut)|(?:\bmarin)|(?:\bnation)|(?:\backnowledg)|(?:\bcomment)|(?:\bcontribut)|(?:\bfund)|(?:\bgrate)|(?:\bprovid)|(?:\bproject)|(?:\bpossibl)|(?:\bscienc)|(?:author)|(?:grant)|(?:fellowship)|(?:program)|(?:programm)|(?:suggest)|(?:taraexpedit)|(?:université)|(?:valuabl)|(?:without)|(?:pmc articles)|(?:oceans expedition)|(?:oceans consortium)|(?:anonymous reviewers)|(?:article contribution)|(?:environment foundation)|(?:people sponsors)|(?:projects? poseidon)|(?:wish thank)|(?:commitment following)|(?:continuous support)|(?:data analysis)|(?:exist without)|(?:tara girus)|(?:keen thank)|(?:oceans taraexpeditions)|(?:possible thanks)|(?:sponsors made)|(?:technical assistance)|(?:thank commitment)|(?:without continuous)'
|
||||
# with tara: pattern=r'(?:\bthank)|(?:\btara\b)|(?:\barticl)|(?:\bocean\b)|(?:\bpmc\b)|(?:\bsupport)|(?:\bsampl)|(?:\bexpedit)|(?:\bfoundat)|(?:\bresearch)|(?:\bhelp)|(?:\binstitut)|(?:\bmarin)|(?:\bnation)|(?:\backnowledg)|(?:\bcomment)|(?:\bcontribut)|(?:\bfund)|(?:\bgrate)|(?:\bprovid)|(?:\bproject)|(?:\bpossibl)|(?:\bscienc)|(?:author)|(?:grant)|(?:fellowship)|(?:program)|(?:programm)|(?:suggest)|(?:taraexpedit)|(?:université)|(?:valuabl)|(?:without)|(?:pmc articles)|(?:tara oceans)|(?:oceans expedition)|(?:oceans consortium)|(?:anonymous reviewers)|(?:article contribution)|(?:environment foundation)|(?:people sponsors)|(?:projects? poseidon)|(?:wish thank)|(?:commitment following)|(?:continuous support)|(?:data analysis)|(?:exist without)|(?:tara girus)|(?:tara schooner)|(?:keen thank)|(?:oceans taraexpeditions)|(?:possible thanks)|(?:sponsors made)|(?:technical assistance)|(?:thank commitment)|(?:without continuous)'
|
||||
# with tara v1: pattern=r'(?:\bthank)|(?:\bpmc\b)|(?:\bsupport)|(?:\bsampl)|(?:\bfoundat)|(?:\bresearch)|(?:\bhelp)|(?:\binstitut)|(?:\bnation)|(?:\backnowledg)|(?:\bcomment)|(?:\bcontribut)|(?:\bfund)|(?:\bgrate)|(?:\bprovid)|(?:\bproject)|(?:\bpossibl)|(?:\bscienc)|(?:author)|(?:grant)|(?:fellowship)|(?:program)|(?:suggest)|(?:université)|(?:valuabl)|(?:without)|(?:pmc articles)|(?:oceans consortium)|(?:anonymous reviewers)|(?:article contribution)|(?:environment foundation)|(?:people sponsors)|(?:projects? poseidon)|(?:wish thank)|(?:commitment following)|(?:continuous support)|(?:data analysis)|(?:exist without)|(?:keen thank)|(?:possible thanks)|(?:sponsors made)|(?:technical assistance)|(?:thank commitment)|(?:without continuous)'
|
||||
def textacknowledgmentstara(txt, span=20, maxlen=7,
                            pattern=r'(?:crew)|(?:research)|(?:acknowledgements)|(?:acknowledge)|(?:acknowledg)|(?:assistance)|(?:commitment of)|(?:comments)|(?:particular(?:ly)?)|(?:fellowships?)|(?:authors?)|(?:program(?:s|mmes?)?)|(?:projects?)|(?:institutes?)|(?:sponsors)|(?:\bthanks?\b)|(?:possible)|(?:\bgrant(?:ed|s)?)|(?:\bsampl(?:e[sd]?|ing))|(?:\bsupport(?:ing)?\b)|(?:foundation)|(?:expedition)|(?:anr-)|(?:\bthis work\b)|(?:\bfunded by\b)|(?:\bthis study\b)|(?:following (?:people|individuals|institutions?|organizations?|sponsors))|(?:contribution (?:no|number))|(?:\bwish thanks?\b)|(?:\b23 institutes\b)|(?:\bgrateful(?: in| to| for|ly))|(?:supported (?:by|in))|(?:continuous support)|(?:exist without)|(?:following people)|(?:without continuous support)|(?:part of the)'):
    """
    .. function:: textacknowledgmentstara(text, span = 20, maxlen = 7, pattern = ...)

    Returns the "Acknowledgments" section of a document, tuned for the Tara
    Oceans corpus.  It searches for the part of the document that has the
    highest density of *pattern* matches.

    .. parameters:: txt,span,maxlen,pattern
       txt: input text.
       span: number of words per chunk that txt is split into.
       maxlen: size, in chunks, of the scrolling window over the text in
           which the match density is calculated.
       pattern: regular expression matched against each (lowercased) chunk.
           The default matches words and phrases that commonly occur in
           acknowledgment sections.

    Examples:

    >>> sql("select textacknowledgments('')")
    textacknowledgments('')
    ------------------
    <BLANKLINE>
    """

    # Clean text from \r\n line endings.
    exp = re.sub('\r\n', '\n', txt)
    # Drop soft line breaks and deduplicate spaces.
    exp = reduce_spaces.sub(' ', strip_remove_newlines.sub('', exp))

    # If the text is small, return it unchanged.
    if exp.count(' ') < span * 10:
        return exp

    origwords = exp.split(' ')
    words = exp.lower().split(' ')

    # Split the original and the lowercased text into span-sized chunks.
    spanedorigtext = [' '.join(origwords[i:i + span]) for i in range(0, len(origwords), span)]
    spanedlowtext = [' '.join(words[i:i + span]) for i in range(0, len(words), span)]

    # Floor division: the former maxlen/2 would yield a float under Python 3.
    half = maxlen // 2

    # Per-chunk pattern match counts, zero-padded on both sides so a centered
    # window of size maxlen fits even at the edges.
    results = [0] * half
    for chunk in spanedlowtext:
        results.append(sum(1 for _ in re.finditer(pattern, chunk)))
    results.extend([0] * half)

    # Smoothed density: mean match count over a centered maxlen-chunk window.
    densities = []
    for i in range(half, len(results) - half):
        densities.append(sum(results[i - half:i - half + maxlen]) * 1.0 / maxlen)

    threshold = 1

    # Group consecutive above-threshold chunks into "paragraphs" and track
    # each paragraph's total raw match count.
    paragraphsum = []
    paragraphs = []
    prev = -10  # sentinel so the first hit always opens a new paragraph
    for current, line in enumerate(spanedorigtext):
        if densities[current] > threshold:
            if (prev + 1) != current:
                # first visit of a new paragraph
                paragraphsum.append(0)
                paragraphs.append([])
            paragraphsum[-1] += results[half + current]
            paragraphs[-1].append(line)
            prev = current

    # Sentinel entry so that index 0 is valid even when nothing matched.
    paragraphsum.append(0)
    paragraphs.append([])

    # Pick the paragraph with the largest total match count.
    maxsum = 0
    maxi = 0
    for cnt, total in enumerate(paragraphsum):
        if total > maxsum:
            maxsum = total
            maxi = cnt

    # Require a minimal amount of evidence before returning a section.
    if paragraphsum[maxi] > 2:
        return '\n'.join(paragraphs[maxi])
    else:
        return ''


textacknowledgmentstara.registered = True
|
||||
|
||||
|
||||
# Self-test bootstrap: runs only when this module is imported/executed as a
# top-level module (its __name__ contains no package dots).
if not ('.' in __name__):
    """
    This is needed to be able to test the function, put it at the end of every
    new function you create
    """
    import sys
    import setpath
    from functions import *
    # Register/verify the functions defined above with the framework.
    testfunction()
    if __name__ == "__main__":
        # Python-2-only bootstrap: reload(sys) re-exposes setdefaultencoding
        # (removed by site.py) so the doctests can print UTF-8.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        import doctest
        doctest.testmod()
|
@ -0,0 +1,6 @@
|
||||
/* Closed accordion item: down-pointing chevron (inline SVG data URI)
   rendered after the title. */
.uk-accordion-title::after {
    background-image: url(data:image/svg+xml;charset=utf-8,%3Csvg%20width%3D%2220%22%20height%3D%2220%22%20viewBox%3D%220%200%2020%2020%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%3E%20%3Cpolyline%20fill%3D%22none%22%20stroke%3D%22%23000%22%20stroke-width%3D%221.03%22%20points%3D%2216%207%2010%2013%204%207%22%3E%3C%2Fpolyline%3E%3C%2Fsvg%3E);
}
/* Open accordion item: same chevron flipped to point upwards. */
.uk-open>.uk-accordion-title::after {
    background-image: url(data:image/svg+xml;charset=utf-8,%3Csvg%20width%3D%2220%22%20height%3D%2220%22%20viewBox%3D%220%200%2020%2020%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%3E%20%3Cpolyline%20fill%3D%22none%22%20stroke%3D%22%23000%22%20stroke-width%3D%221.03%22%20points%3D%224%2013%2010%207%2016%2013%22%3E%3C%2Fpolyline%3E%3C%2Fsvg%3E);
}
|
@ -0,0 +1,6 @@
|
||||
/* Closed accordion item: down-pointing chevron (inline SVG data URI)
   rendered after the title. */
.uk-accordion-title::after {
    background-image: url(data:image/svg+xml;charset=utf-8,%3Csvg%20width%3D%2220%22%20height%3D%2220%22%20viewBox%3D%220%200%2020%2020%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%3E%20%3Cpolyline%20fill%3D%22none%22%20stroke%3D%22%23000%22%20stroke-width%3D%221.03%22%20points%3D%2216%207%2010%2013%204%207%22%3E%3C%2Fpolyline%3E%3C%2Fsvg%3E);
}
/* Open accordion item: same chevron flipped to point upwards. */
.uk-open>.uk-accordion-title::after {
    background-image: url(data:image/svg+xml;charset=utf-8,%3Csvg%20width%3D%2220%22%20height%3D%2220%22%20viewBox%3D%220%200%2020%2020%22%20xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F2000%2Fsvg%22%3E%20%3Cpolyline%20fill%3D%22none%22%20stroke%3D%22%23000%22%20stroke-width%3D%221.03%22%20points%3D%224%2013%2010%207%2016%2013%22%3E%3C%2Fpolyline%3E%3C%2Fsvg%3E);
}
|
@ -1,3 +1,4 @@
|
||||
export const environment = {
|
||||
production: true
|
||||
production: true,
|
||||
miningbackendserveraddress: 'https://beta.services.openaire.eu/interactive-mining'
|
||||
};
|
||||
|
Binary file not shown.
Loading…
Reference in New Issue