# coding: utf-8
import setpath
import re
from lib import porter2 as porter
import functions
import unicodedata
import itertools
# Increase regular expression cache
try:
    re._MAXCACHE = 1000
except AttributeError:
    pass
# Every regular expression containing \W \w \D \d \b \S \s needs to be compiled
# like below. If you want to embed the UNICODE directive inside the
# regular expression use:
# (?u) like re.sub(ur'(?u)[\W\d]', ' ', o)
delete_numbers_and_non_letters=re.compile(ur'[\W]',re.UNICODE)
delete_non_letters=re.compile(ur'[\W]',re.UNICODE)
delete_word_all=re.compile(ur'\w+\sall',re.UNICODE)
delete_word_all_and_or=re.compile(ur'\w+\sall\s(?:and|or)',re.UNICODE)
text_tokens = re.compile(ur'([\d.]+\b|\w+|\$[\d.]+)', re.UNICODE)
strip_remove_newlines=re.compile(u'(?:\\s+$|^\\s+|(?<=[^\\s\\d\\w.;,!?])\n+)', re.UNICODE)
reduce_spaces=re.compile(ur'\s+', re.UNICODE)
cqlterms=('title', 'subject', 'person', 'enter', 'creator', 'isbn')
replchars = re.compile(r'[\n\r]')
def textacknowledgments(txt,span = 10,maxlen = 3,pattern = r'(?:support)|(?:thank)|(?:in part)|(?:research)|(?:\bwork\b)|(?:\bgrants?\b)|(?:project)|(?:science)|(?:fund)|(?:nation)|(?:author)|(?:foundation)|(?:\bprogram\b)|(?:\bhelp\b)|(?:paper)|(?:technolog)|(?:partial)|(?:acknowledg)|(?:provid)|(?:grate)|(?:\bcenter\b)|(?:study)|(?:discuss)|(?:particip)|(?:ministry)|(?:contribut)|(?:european)|(?:number)|(?:valuabl)|(?:education)|(?:council)|(?:award)|(?:contract)|(?:institut)' ):
"""
.. function:: textacknowledgments(text, span = 10, maxlen = 5, pattern = (\b|_)(1|2)\d{3,3}(\b|_))
Returns the "Reference" section of documents. To find it, it searches for parts of the document that
have a high density of pattern matches.
.. parameters:: txt,maxlen,pattern
txt: input text.
span: the size of the string in words that the txt is splited
maxlen: the size of the scrolling window over the text in which the density is calculated.
pattern: regular expression that is matched against the lines of the text. By default the pattern matches
year occurences so as to extract sections that look like references.
Examples:
>>> sql("select textacknowledgments('')")
textacknowledgments('')
------------------
<BLANKLINE>
"""
exp = re.sub('\r\n','\n',txt)
exp = reduce_spaces.sub(' ', strip_remove_newlines.sub('', exp))
if exp.count(' ') < span * 10:
return exp
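    # Two parallel chunkings of the text: original-case chunks (returned to the
    # caller) and lowercased chunks (matched against the pattern).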
    origwords = exp.split(' ')
    words = exp.lower().split(' ')
    spanedorigtext = [' '.join(origwords[i:i+span]) for i in range(0, len(origwords), span)]
    spanedstemtext = [' '.join(words[i:i+span]) for i in range(0, len(words), span)]
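    # Count pattern matches per chunk, zero-padding maxlen/2 entries on each
    # side so the centered window below is defined at the text's edges.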
    results = []
    densities = []
    for i in xrange(maxlen/2):
        results.append(0)
    for chunk in spanedstemtext:
        results.append(sum(1 for m in re.finditer(pattern, chunk)))
    for i in xrange(maxlen/2):
        results.append(0)
#print len(spanedorigtext), len(spanedstemtext), len(results), len(results)-maxlen/2 - maxlen/2
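    # densities[j] is the mean match count over the maxlen-chunk window centered on chunk j.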
for i in xrange(maxlen/2,len(results)-maxlen/2):
densities.append(sum(results[i-maxlen/2:i-maxlen/2+maxlen])*1.0/maxlen)
# for cnt, i in enumerate(spanedorigtext):
# print i, results[maxlen/2+cnt], densities[cnt]
threshold = 1
paragraphsum = []
paragraphs = []
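    # Group consecutive above-threshold chunks into candidate paragraphs,
    # tracking each paragraph's total match count.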
    prev = -10
    maxsum = 0
    maxi = 0
    for current, line in enumerate(spanedorigtext):
        if densities[current] > threshold:
            # new paragraph first visit
            if (prev+1) != current:
                paragraphsum.append(0)
                paragraphs.append([])
            paragraphsum[-1] += results[maxlen/2+current]
            paragraphs[-1].append(line)
            prev = current
for cnt, paragraph in enumerate(paragraphs):
if paragraphsum[cnt] > maxsum:
maxsum = paragraphsum[cnt]
maxi = cnt
# print '\n'.join(paragraph), paragraphsum[cnt], '\n'
# print '!!!!!!!!', maxsum, maxi
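    # Append an empty sentinel paragraph so paragraphs[maxi] is safe even when
    # no dense region was found.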
paragraphsum.append(0)
paragraphs.append([])
if paragraphsum[maxi] > 2:
return '\n'.join(paragraphs[maxi])
#return ('\n'.join(paragraphs[maxi]))+" "+str(paragraphsum[maxi])
else:
return ''
textacknowledgments.registered=True
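# A hedged usage sketch (assumes madis' mterm and a hypothetical table `docs`
# with a text column `document`; both names are illustrative only):
#
#   select textacknowledgments(document) from docs;
#
# For each document this yields the densest acknowledgment-like paragraph,
# or the empty string when no sufficiently dense region exists.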
def textacknowledgmentsstem(txt,span = 10,maxlen = 3,pattern = r'(?:support)|(?:thank)|(?:research)|(?:\bwork\b)|(?:\bgrant\b)|(?:project)|(?:scienc)|(?:\bfund\b)|(?:nation)|(?:author)|(?:foundat)|(?:\bprogram\b)|(?:\bhelp\b)|(?:univers)|(?:paper)|(?:technolog)|(?:partial)|(?:comment)|(?:develop)|(?:acknowledg)|(?:review)|(?:provid)|(?:grate)|(?:\bcenter\b)|(?:studi)|(?:discuss)|(?:particip)|(?:ministri)|(?:contribut)|(?:european)|(?:system)|(?:comput)|(?:number)|(?:valuabl)|(?:educ)|(?:council)|(?:award)|(?:contract)|(?:inform)|(?:institut)' ):
"""
.. function:: textacknowledgmentsstem(text, span = 10, maxlen = 5, pattern = (\b|_)(1|2)\d{3,3}(\b|_))
Returns the "Reference" section of documents. To find it, it searches for parts of the document that
have a high density of pattern matches.
.. parameters:: txt,maxlen,pattern
txt: input text.
span: the size of the string in words that the txt is splited
maxlen: the size of the scrolling window over the text in which the density is calculated.
pattern: regular expression that is matched against the lines of the text. By default the pattern matches
year occurences so as to extract sections that look like references.
Examples:
>>> sql("select textacknowledgmentsstem('')")
textacknowledgmentsstem('')
------------------
<BLANKLINE>
"""
exp = re.sub('\r\n','\n',txt)
exp = reduce_spaces.sub(' ', strip_remove_newlines.sub('', exp))
if exp.count(' ') < span * 10:
return exp
acknowledgments = []
origwords = exp.split(' ')
words = exp.lower()
words = words.split(' ')
stemed = []
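    # Stem every word with the Porter stemmer so the stemmed default pattern
    # matches; fall back to the raw word if stemming fails.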
for k in words:
if len(k) > 0:
try:
stemed.append(porter.stem(k))
except Exception:
stemed.append(k)
spanedorigtext = [' '.join(origwords[i:i+span]) for i in range(0, len(origwords), span)]
spanedstemtext = [' '.join(stemed[i:i+span]) for i in range(0, len(stemed), span)]
    results = []
    densities = []
    for i in xrange(maxlen/2):
        results.append(0)
    for chunk in spanedstemtext:
        results.append(sum(1 for m in re.finditer(pattern, chunk)))
    for i in xrange(maxlen/2):
        results.append(0)
for i in xrange(maxlen/2,len(results)-maxlen/2):
densities.append(sum(results[i-maxlen/2:i-maxlen/2+maxlen])*1.0/maxlen)
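    # Unlike textacknowledgments, this variant returns every chunk whose
    # density exceeds the threshold, not just the single densest paragraph.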
    threshold = 1
    for current, chunk in enumerate(spanedorigtext):
        if len(chunk) > 10 and densities[current] > threshold:
            acknowledgments.append(chunk)
    return '\n'.join(acknowledgments)
textacknowledgmentsstem.registered=True
# without tara: pattern=r'(?:\bthank)|(?:\barticl)|(?:\bpmc\b)|(?:\bsupport)|(?:\bsampl)|(?:\bexpedit)|(?:\bfoundat)|(?:\bresearch)|(?:\bhelp)|(?:\binstitut)|(?:\bmarin)|(?:\bnation)|(?:\backnowledg)|(?:\bcomment)|(?:\bcontribut)|(?:\bfund)|(?:\bgrate)|(?:\bprovid)|(?:\bproject)|(?:\bpossibl)|(?:\bscienc)|(?:author)|(?:grant)|(?:fellowship)|(?:program)|(?:programm)|(?:suggest)|(?:taraexpedit)|(?:université)|(?:valuabl)|(?:without)|(?:pmc articles)|(?:oceans expedition)|(?:oceans consortium)|(?:anonymous reviewers)|(?:article contribution)|(?:environment foundation)|(?:people sponsors)|(?:projects? poseidon)|(?:wish thank)|(?:commitment following)|(?:continuous support)|(?:data analysis)|(?:exist without)|(?:tara girus)|(?:keen thank)|(?:oceans taraexpeditions)|(?:possible thanks)|(?:sponsors made)|(?:technical assistance)|(?:thank commitment)|(?:without continuous)'
# with tara: pattern=r'(?:\bthank)|(?:\btara\b)|(?:\barticl)|(?:\bocean\b)|(?:\bpmc\b)|(?:\bsupport)|(?:\bsampl)|(?:\bexpedit)|(?:\bfoundat)|(?:\bresearch)|(?:\bhelp)|(?:\binstitut)|(?:\bmarin)|(?:\bnation)|(?:\backnowledg)|(?:\bcomment)|(?:\bcontribut)|(?:\bfund)|(?:\bgrate)|(?:\bprovid)|(?:\bproject)|(?:\bpossibl)|(?:\bscienc)|(?:author)|(?:grant)|(?:fellowship)|(?:program)|(?:programm)|(?:suggest)|(?:taraexpedit)|(?:université)|(?:valuabl)|(?:without)|(?:pmc articles)|(?:tara oceans)|(?:oceans expedition)|(?:oceans consortium)|(?:anonymous reviewers)|(?:article contribution)|(?:environment foundation)|(?:people sponsors)|(?:projects? poseidon)|(?:wish thank)|(?:commitment following)|(?:continuous support)|(?:data analysis)|(?:exist without)|(?:tara girus)|(?:tara schooner)|(?:keen thank)|(?:oceans taraexpeditions)|(?:possible thanks)|(?:sponsors made)|(?:technical assistance)|(?:thank commitment)|(?:without continuous)'
# with tara v1: pattern=r'(?:\bthank)|(?:\bpmc\b)|(?:\bsupport)|(?:\bsampl)|(?:\bfoundat)|(?:\bresearch)|(?:\bhelp)|(?:\binstitut)|(?:\bnation)|(?:\backnowledg)|(?:\bcomment)|(?:\bcontribut)|(?:\bfund)|(?:\bgrate)|(?:\bprovid)|(?:\bproject)|(?:\bpossibl)|(?:\bscienc)|(?:author)|(?:grant)|(?:fellowship)|(?:program)|(?:suggest)|(?:université)|(?:valuabl)|(?:without)|(?:pmc articles)|(?:oceans consortium)|(?:anonymous reviewers)|(?:article contribution)|(?:environment foundation)|(?:people sponsors)|(?:projects? poseidon)|(?:wish thank)|(?:commitment following)|(?:continuous support)|(?:data analysis)|(?:exist without)|(?:keen thank)|(?:possible thanks)|(?:sponsors made)|(?:technical assistance)|(?:thank commitment)|(?:without continuous)'
def textacknowledgmentstara(txt, span=20, maxlen=7,
pattern=r'(?:crew)|(?:research)|(?:acknowledgements)|(?:acknowledge)|(?:acknowledg)|(?:assistance)|(?:commitment of)|(?:comments)|(?:particular(?:ly)?)|(?:fellowships?)|(?:authors?)|(?:program(?:s|mmes?)?)|(?:projects?)|(?:institutes?)|(?:sponsors)|(?:\bthanks?\b)|(?:possible)|(?:\bgrant(?:ed|s)?)|(?:\bsampl(?:e[sd]?|ing))|(?:\bsupport(?:ing)?\b)|(?:foundation)|(?:expedition)|(?:anr-)|(?:\bthis work\b)|(?:\bfunded by\b)|(?:\bthis study\b)|(?:following (?:people|individuals|institutions?|organizations?|sponsors))|(?:contribution (?:no|number))|(?:\bwish thanks?\b)|(?:\b23 institutes\b)|(?:\bgrateful(?: in| to| for|ly))|(?:supported (?:by|in))|(?:continuous support)|(?:exist without)|(?:following people)|(?:without continuous support)|(?:part of the)'):
"""
.. function:: textacknowledgments(text, span = 10, maxlen = 5, pattern = (\b|_)(1|2)\d{3,3}(\b|_))
Returns the "Reference" section of documents. To find it, it searches for parts of the document that
have a high density of pattern matches.
.. parameters:: txt,maxlen,pattern
txt: input text.
span: the size of the string in words that the txt is splited
maxlen: the size of the scrolling window over the text in which the density is calculated.
pattern: regular expression that is matched against the lines of the text. By default the pattern matches
year occurences so as to extract sections that look like references.
Examples:
>>> sql("select textacknowledgments('')")
textacknowledgments('')
------------------
<BLANKLINE>
"""
    # normalize \r\n line endings to \n
    exp = re.sub('\r\n', '\n', txt)
    # deduplicate spaces
    exp = reduce_spaces.sub(' ', strip_remove_newlines.sub('', exp))
    # if the text is small, return it as is
if exp.count(' ') < span * 10:
return exp
    origwords = exp.split(' ')
    words = exp.lower().split(' ')
    spanedorigtext = [' '.join(origwords[i:i + span]) for i in range(0, len(origwords), span)]
    spanedstemtext = [' '.join(words[i:i + span]) for i in range(0, len(words), span)]
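    # As in textacknowledgments: per-chunk match counts, zero-padded so the
    # centered window is defined at the text's edges.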
    results = []
    densities = []
    for i in xrange(maxlen / 2):
        results.append(0)
    for chunk in spanedstemtext:
        results.append(sum(1 for m in re.finditer(pattern, chunk)))
    for i in xrange(maxlen / 2):
        results.append(0)
# print len(spanedorigtext), len(spanedstemtext), len(results), len(results)-maxlen/2 - maxlen/2
for i in xrange(maxlen / 2, len(results) - maxlen / 2):
densities.append(sum(results[i - maxlen / 2:i - maxlen / 2 + maxlen]) * 1.0 / maxlen)
# for cnt, i in enumerate(spanedorigtext):
# print results[maxlen/2+cnt], densities[cnt], i
threshold = 1
paragraphsum = []
paragraphs = []
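    # Group consecutive above-threshold chunks into candidate paragraphs,
    # tracking each paragraph's total match count.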
    prev = -10
    maxsum = 0
    maxi = 0
    for current, line in enumerate(spanedorigtext):
        if densities[current] > threshold:
            # new paragraph first visit
            if (prev + 1) != current:
                paragraphsum.append(0)
                paragraphs.append([])
            paragraphsum[-1] += results[maxlen / 2 + current]
            paragraphs[-1].append(line)
            prev = current
for cnt, paragraph in enumerate(paragraphs):
if paragraphsum[cnt] > maxsum:
maxsum = paragraphsum[cnt]
maxi = cnt
# print '\n'.join(paragraph), paragraphsum[cnt], '\n'
# print '!!!!!!!!', maxsum, maxi
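    # Append an empty sentinel paragraph so paragraphs[maxi] is safe even when
    # no dense region was found.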
paragraphsum.append(0)
paragraphs.append([])
if paragraphsum[maxi] > 2:
return '\n'.join(paragraphs[maxi])
# return ('\n'.join(paragraphs[maxi]))+" "+str(paragraphsum[maxi])
else:
return ''
textacknowledgmentstara.registered = True
if not ('.' in __name__):
"""
This is needed to be able to test the function, put it at the end of every
new function you create
"""
import sys
import setpath
from functions import *
testfunction()
if __name__ == "__main__":
reload(sys)
sys.setdefaultencoding('utf-8')
import doctest
doctest.testmod()