# -*- coding: utf-8 -*-
"""
Created on Sat Apr 27 20:08:25 2024
@author: Arnd Helmut Hafner
"""
#processes the index data in order to tag the original text
import sys # for sys.exit after error
import re
import tools
import copy
from re import compile, search
import inputmethods
#Creation of dictionary of index data accessible by slip number
def slipnofrqtest(sliplist):
    '''
    Check the consistency of the frequency numbers in the records of one slip.

    A record is inconsistent when its reference frequencies differ from each
    other, or when both an item frequency and reference frequencies exist and
    the first reference frequency differs from the item frequency.

    Parameters
    ----------
    sliplist : list
        One value of slipnodic
        [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]]]

    Returns
    -------
    inconsistency : list
        Positions of the records that include inconsistencies, in ascending
        order. Every position is listed only once.
    '''
    inconsistency = []
    for n, record in enumerate(sliplist):
        frq, reffrq = record[4], record[6]
        # All reference frequencies of one record have to agree with each other.
        if any(value != reffrq[0] for value in reffrq[1:]):
            inconsistency.append(n)
        # If there are no items, frq is zero; if there are no refitems, reffrq
        # is an empty list. The two can only be compared when both exist.
        elif frq and reffrq and reffrq[0] != frq:
            inconsistency.append(n)
    return inconsistency
def createdic(lemmata,items,refitems):
    '''
    Create a dictionary that gives access to the index data by slip number.

    Frequency inconsistencies found by slipnofrqtest are written to the file
    'logcreatdict(frequencycheck).txt' in the current working directory.

    Parameters
    ----------
    lemmata : list
        [(headword,headst,ID)]
    items : list
        [(stringori,stringreg,lemmatano,slipno,slipnote,notes)]
    refitems : list
        [(stringori,stringreg,lemmatano,slipno,slipnote,notes)]

    Returns
    -------
    slipnodic : dict
        {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]]]}
        [stringori],[stringreg],[lemmatano] are put in list form
        in preparation for merging overlapping strings.
        [reflemno],[reffrq] are put in list form because every string
        appears once under the main lemma and numerous times under different
        reference lemmata.
        The records of every slip are sorted in declining order of the
        length of stringori. A frq of zero is replaced by the first reffrq.

    Raises
    ------
    SystemExit
        If an item repeats an existing stringori+stringreg+ID key of its slip
        with a different lemma number.
    '''
    slipnodic = {}
    tempdic = {}#per slip: key -> position of the record within slipnodic[slipno]
    #Entering item data into slipnodic
    for item in items:
        stringori,stringreg,lemmatano,slipno,slipnote,notes = item
        ID = lemmata[lemmatano][2]
        #using stringori+stringreg+ID as key because same original string
        #can be assigned two different standardized forms. E.g.:
        #司空、尉主→尉主 and 司空、尉主→司空主
        #佐蒲、就→(啓陵郷)佐蒲 and 佐蒲、就→(啓陵郷)佐就
        #Additionally, identical strings can appear simultaneously in different indices
        tempkey = stringori+stringreg+ID
        if slipno not in slipnodic:
            slipnodic[slipno] = [[[stringori],[stringreg],ID,[lemmatano],1,[],[]]]
            tempdic[slipno] = {tempkey: 0}
        elif tempkey not in tempdic[slipno]:
            tempdic[slipno][tempkey] = len(slipnodic[slipno])
            slipnodic[slipno].append([[stringori],[stringreg],ID,[lemmatano],1,[],[]])
        else:
            record = slipnodic[slipno][tempdic[slipno][tempkey]]
            if record[3][0] == lemmatano:
                record[4] += 1
            else:
                sys.exit('Error in item loop of createdic\n{}'.format(item))
    #Entering refitem data into slipnodic
    for refitem in refitems:
        stringori,stringreg,lemmatano,slipno,slipnote,notes = refitem
        ID = lemmata[lemmatano][2]
        tempkey = stringori+stringreg+ID
        if slipno not in slipnodic:
            slipnodic[slipno] = [[[stringori],[stringreg],ID,[],0,[lemmatano],[1]]]
            tempdic[slipno] = {tempkey: 0}
        elif tempkey not in tempdic[slipno]:
            tempdic[slipno][tempkey] = len(slipnodic[slipno])
            slipnodic[slipno].append([[stringori],[stringreg],ID,[],0,[lemmatano],[1]])
        else:
            record = slipnodic[slipno][tempdic[slipno][tempkey]]
            if lemmatano in record[5]:
                #the first element of the lookup result is used as the
                #position of lemmatano within [reflemno]
                i = tools.lookuplistitem(lemmatano,record[5])
                i = i[0]
                record[6][i] += 1
            else:
                record[5].append(lemmatano)
                record[6].append(1)
    #The with-block guarantees that the log file is flushed and closed,
    #also when an exception interrupts the checks.
    with open('logcreatdict(frequencycheck).txt','w',encoding='utf-8-sig') as logfile:
        for slipno in slipnodic:
            #Sorting in declining order of length of stringori
            #Will be necessary when checking overlaps between strings
            slipnodic[slipno] = sorted(slipnodic[slipno] , key=lambda x: len(re.findall('.',x[0][0]))*-1)
            #checking appearance frequencies
            for n in slipnofrqtest(slipnodic[slipno]):
                logfile.write('inconsistency in record {} of slip {}\n{}\n'.format(n,slipno,slipnodic[slipno][n]))
            #merging frqdata: records without item frequency take over the first reference frequency
            for record in slipnodic[slipno]:
                if not record[4]:
                    record[4] = record[6][0]
    return(slipnodic)
#Preliminary consistency check
def simplefrqcheck(slipno,recordno,logfile):
    '''
    Placeholder for a frequency check of a single record.

    The function has no implementation yet: it ignores all of its
    arguments and returns None.
    '''
    return None
def simpleconsistencycheck(slipnodic,sliptext,filename):
    '''
    Check data consistency by comparing the appearances of index data in the
    slip text data and write the result to a logfile.

    Parameters
    ----------
    slipnodic : dict
        {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]]]}
    sliptext : dict
        {slipno:[sliptext,docstyle,posinstyle,posindoc]}
    filename : str
        Path of the logfile; it is opened for writing (utf-8-sig) and
        overwritten.

    Output
    ------
    inconsistencies written to the logfile:
        a)slip numbers not found in slip text data
        b)strings with more or less appearances in the text than in the indices

    Returns
    -------
    None
    '''
    #The with-block closes the logfile even if a lookup or a regular
    #expression raises in the middle of the check.
    with open(filename,'w',encoding='utf-8-sig') as logfile:
        for slipno in slipnodic:
            if slipno not in sliptext:
                logfile.write('slip number {} not found in sliptext\n'.format(slipno))
                continue
            fulltext = sliptext[slipno][0]
            for record in slipnodic[slipno]:
                stringori = record[0][0]
                #search pattern built by the project helper from stringori and
                #a character class of symbols that may be interspersed in the text
                searchstring = tools.searchpreparation(stringori,'[┘## 〼■(?)、,]*')
                frqindex = record[4]
                frqtext = len(re.findall(searchstring,fulltext))
                if frqtext == frqindex:
                    continue
                logfile.write('frequency inconsistency for {} in slip {}\n'
                              'frequency in index is {} , frequency in text is {}\n'
                              'Full text is as follows:\n{}\n'
                              '\n'.format(stringori,slipno,frqindex,
                                          frqtext,fulltext))
    return
#Original text data string
def stringorireplacement(slipnodic,sliptext,filename):
    '''
    Replaces index data strings with text data strings.

    Stringori is already a simplified version of the original text data strings.
    Thus, strings must be reversed into their original form in order to guarantee
    accurate tagging of original text data.
    For every record the string is searched in the slip text; stringori and frq
    of the record are overwritten with the form and the count found in the text.
    If several written variations are found, an additional record is created
    for each further variation. Discrepancies are written to the logfile.

    Parameters
    ----------
    slipnodic : dict
        {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]]]}
        A slip number containing '~' denotes index data that stretch over two slips.
    sliptext : dict
        {slipno:[sliptext,docstyle,posinstyle,posindoc]}
    filename : str
        Path of the logfile; it is opened for writing (utf-8-sig).

    Returns
    -------
    slipnodic : dict
        stringori is replaced by correspondent string in original slip text

    Notes
    -----
    NOTE(review): the logfile is never closed in this function.
    NOTE(review): tools.listitemcount is used here as if it returned a dict
    {matched string: number of hits}; tools.listaddrecords is presumably
    inserting the (record, position) pairs into the record list - confirm
    against the tools module.
    '''
    logfile = open(filename,'w',encoding='utf-8-sig')
    for slipno in slipnodic.keys():
        if '~' in slipno:#index data stretch over two slips
            #creating a temporary sliptext out of the
            #last 7 characters of slip1 and the first 7 characters of slip2
            temp = slipno.split('~')
            sliptexttemp = re.findall('[^#]{1,7}#[^#]{1,7}',sliptext[temp[0]][0] + '#' + sliptext[temp[1]][0])
            sliptexttemp = sliptexttemp[0]
        elif slipno not in sliptext.keys():
            logfile.write('Slip number {} not found in slip text\n\n'.format(slipno))
            continue
        else:sliptexttemp = sliptext[slipno][0]
        #additional records for further written variations, collected as (record, position of origin)
        newrecords = []
        for n in range(len(slipnodic[slipno])):
            #finding strings in text
            stringori = slipnodic[slipno][n][0][0]
            searchstring = tools.searchpreparation(stringori,'[┘## 〼■、,()]*?')
            #the plain string is preferred when it yields as many hits as the tolerant search pattern
            if len(re.findall(stringori,sliptexttemp)) == len(re.findall(searchstring,sliptexttemp)):
                hitsinsliptext = tools.listitemcount(re.findall(stringori,sliptexttemp))
            else:#In the case that correct matching is impeded by special characters
                hitsinsliptext = tools.listitemcount(re.findall(searchstring,sliptexttemp))
            if not hitsinsliptext:
                logfile.write('String {} not found in slip {}\n slip text is:\n{}\n\n'
                              .format(stringori,slipno,sliptexttemp))
                continue
            ID = slipnodic[slipno][n][2]
            frqindex = slipnodic[slipno][n][4]
            #elimination of unrelated strings ( = matches with longer strings from other indices)
            if len(hitsinsliptext) > 1:
                #records before position n carry strings of equal or greater length (sorted in createdic)
                for m in range(n-1,-1,-1):
                    for strings in hitsinsliptext.keys():
                        if strings in slipnodic[slipno][m][0][0] and ID != slipnodic[slipno][m][2]:
                            hitsinsliptext[strings] -= slipnodic[slipno][m][4]
                #variations whose hits are completely explained by other records are dropped
                hitsinsliptext = {key: val for key, val in hitsinsliptext.items() if val != 0}
                if not hitsinsliptext:#already non-existent
                    #NOTE(review): processing continues after this message; with an
                    #empty dict itemlist[0] below raises IndexError - confirm intent
                    logfile.write('frequency inconsistency for {} in slip {} after first overlap check\n'
                                  'no string left\n'
                                  'Full text is as follows:\n{}\n\n'
                                  '\n'.format(stringori,slipno,sliptexttemp))
                for strings in hitsinsliptext.keys():
                    if hitsinsliptext[strings] < 0:#already non-existent
                        logfile.write('frequency inconsistency for {} in slip {} after first overlap check\n'
                                      'frequency of {} dropped below 0\n'
                                      'Full text is as follows:\n{}\n\n'
                                      '\n'.format(stringori,slipno,strings,sliptexttemp))
            itemlist = list(hitsinsliptext.keys())
            #replacing strings: the first variation overwrites the record itself
            slipnodic[slipno][n][0][0] = itemlist[0]
            slipnodic[slipno][n][4] = frqtext = hitsinsliptext[itemlist[0]]
            if len(itemlist) == 1:
                #len(itemlist) == 1 means that there is only one variation of the string in the text
                #dealing with frequency inconsistencies
                if frqindex == frqtext: continue
                elif frqindex < frqtext:
                    #searching for overlaps in the same index
                    frqprocessed = frqtext
                    for m in range(n-1,-1,-1):
                        if stringori in slipnodic[slipno][m][0][0] and ID == slipnodic[slipno][m][2]:
                            frqprocessed -= slipnodic[slipno][m][4]
                    if frqindex != frqprocessed:
                        logfile.write('frequency inconsistency for {} in slip {} after second overlap check\n'
                                      'frequency in index is {} , frequency in text originally is {}, after processing is {}\n'
                                      'Full text is as follows:\n{}\n\n'
                                      '\n'.format(stringori,slipno,frqindex,frqtext,
                                                  frqprocessed,sliptexttemp))
                elif frqindex > frqtext:
                    #These occurrences have proved to be caused by errors in original data.
                    #No further occurrences after correction of data
                    logfile.write('frequency inconsistency for {} in slip {}, more occurrences in index than in text\n'
                                  'frequency in index is {} , frequency in text is {}\n'
                                  'Full text is as follows:\n{}\n\n'
                                  '\n'.format(stringori,slipno,frqindex,
                                              frqtext,sliptexttemp))
            #dealing with differences in string representation in slip text and frequency inconsistencies
            else:#dealing with different variations of the string
                #the hits of all variations together have to add up to the index frequency
                frqprocessed = frqindex
                frqprocessed -= frqtext
                for x in range(1,len(itemlist)):
                    #every further variation gets its own copy of the record
                    nrecord = copy.deepcopy(slipnodic[slipno][n])
                    nrecord[0][0] = itemlist[x]
                    frqtext = hitsinsliptext[itemlist[x]]
                    nrecord[4] = frqtext
                    frqprocessed -= frqtext
                    newrecords.append((nrecord,n))
                if frqprocessed != 0:
                    #frqtext reported here is the count of the last variation processed
                    logfile.write('frequency inconsistency for {} in slip {} after dealing with different representations\n'
                                  'frequency in index originally is {}, after processing is {}, frequency in text is {}\n'
                                  'Full text is as follows:\n{}\n\n'
                                  '\n'.format(stringori,slipno,frqindex,frqprocessed,
                                              frqtext,sliptexttemp))
        slipnodic[slipno] = tools.listaddrecords(newrecords,slipnodic[slipno])
    return(slipnodic)
#stringprocessing. Bridging the gap between information stored in original strings
#and information stored in standardized strings
def bracketremoval(prepstring,beforeafter,regex,beforegroup,aftergroup,delinbefore = '',delinafter = ''):
    '''
    Removes brackets and other special symbols from a string, and
    accumulates the removal information in form of tuples of the string
    parts before and after removal.

    Parameters
    ----------
    prepstring : str
        Initially equivalent to slipnodic[slipno][n][0] = stringori
    beforeafter : list
        list of string parts before and after removal of special characters
        before = string part that is matched in order to remove special characters
        after = string part after removal
        The list is extended in place.
    regex : str
        Regular expression that hits special characters
    beforegroup : tuple
        tuple of numbers that indicate the match groups that are used as string part before
    aftergroup : tuple
        tuple of numbers that indicate the match groups that are used as string part after
    delinbefore : str, optional
        Regular expression for special characters that need to be
        extraordinarily deleted in string part before. The default is ''.
    delinafter : str, optional
        Regular expression for special characters that need to be
        extraordinarily deleted in string part after. The default is ''.

    Returns
    -------
    prepstring,beforeafter.
    '''
    #finditer scans the string as it was passed in; the replacements made
    #inside the loop do not influence the matches that are still to come
    for a in re.finditer(regex,prepstring):
        before = ''.join(a.group(n) for n in beforegroup if a.group(n))
        if delinbefore:
            before = re.sub(delinbefore,'',before)
        after = ''.join(a.group(n) for n in aftergroup if a.group(n))
        if delinafter:
            after = re.sub(delinafter,'',after)
        #before and after are literal text taken from the string, not patterns:
        #brackets, '?' etc. in them must not be interpreted as regex syntax,
        #so the first occurrence is replaced literally
        prepstring = prepstring.replace(before,after,1)
        beforeafter.append((before,after))
    return(prepstring,beforeafter)
def bracketcheck(prepstring):
    '''
    Removes brackets and some other symbols from the original string
    and returns tuples of string parts before and string parts after processing.

    The string is passed through a fixed sequence of bracketremoval calls,
    one per kind of notation; each call works on the result of the previous one.

    Parameters
    ----------
    prepstring : str

    Returns
    -------
    prepstring : str
        The string after all removals.
    beforeafter : list
        list of tuples, entailing tuples of relevant string parts in the form before
        and after processing

    Notes
    -----
    NOTE(review): several pattern literals below ('(?)', '?' as deletion
    pattern, '[]', empty groups) are not valid or not meaningful regular
    expressions as written; presumably they contained full-width brackets or
    markup that was lost - verify against the original file before changing
    anything here.
    '''
    beforeafter = []
    #Finding "[字]" = 衍字 = redundant character. Needs surrounding characters to re-identify location after deletion
    regex = '(.)?([({[^}]+}|.)])(.)?'
    prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(1,4),'','?')
    #finding "字(字)" . Contamination by "?" doesn't impede matching.
    #({[^}]+}|[^}﹦〗]) hits unicode external characters.{[^}]+} especially aims at code external
    #characters; [^}] avoids double match on last bracket of external characters, and [^﹦〗]
    #avoids other special characters
    regex = '({[^}]+}|[^}﹦〗])([(〔]??)([^?\-)〕]+)(??[)〕])'
    prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(3,),'','?')
    #Finding ""字(字-字)". Contamination by "?" doesn't impede matching.
    regex = '({[^}]+}|[^}﹦〗])([(〔]??[^?]\-)([^?])(??[)〕])'
    prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(3,),'','?')
    #Finding "字(?)"
    regex = '({[^}]+}|[^>])((?))'
    prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(1,))
    #Finding ")字"
    regex = '()([^<]+)()'#detects enclosed characters
    prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(2,))
    #duplication symbols
    if re.search('﹦.﹦',prepstring):
        #For now, actually only one case:"琅﹦邪﹦守" in slip 8-0657a
        regex = '([^﹦])﹦'
        prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(1,),'[]')
    elif '﹦' in prepstring:
        regex = '(.)([〖]*﹦[〗]*()([^)]+)())'
        prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(3,),'[]')
    return(prepstring,beforeafter)
def fracturecheck(prepstring,stringreg,logfile):
    '''
    Reads information about text lost by slip fractures and writes it into prepstring.
    This information needs to be kept stored after tagging.

    The fracture marks found in prepstring are paired, in order of appearance,
    with the 【...】 annotations found in stringreg; every mark is replaced by
    its own annotation.

    Parameters
    ----------
    prepstring : str
    stringreg : str
        Standardized string; may be a merged form "stringreg1/stringreg2".
    logfile : TextIOWrapper
        Receives a message if the number of fracture marks and the number of
        annotations do not correspond.

    Returns
    -------
    prepstring : str
    beforeafter : list
        a list of tuples (fracture mark, annotation)
    '''
    beforeafter = []
    fracori = re.findall('〼 〼|〼……|……〼|……|〼', prepstring)
    fracreg = re.findall('【[^】]+】', stringreg)
    if len(fracori) == len(fracreg):
        for mark,annotation in zip(fracori,fracreg):
            beforeafter.append((mark,annotation))
            #Only the first remaining occurrence is replaced. Replacing all
            #occurrences would give every identical mark the annotation of the
            #first one and destroy the positional pairing.
            prepstring = prepstring.replace(mark,annotation,1)
    elif '/' in stringreg:
        #During index data consolidation, two stringreg have been merged for some records
        #resulting in "stringreg1/stringreg2"
        #stringreg is split again and the longest part is picked out
        #(the first one in case of equal length)
        longest = sorted(stringreg.split('/') , key=lambda x: len(re.findall('.',x))*-1)[0]
        prepstring,beforeafter = fracturecheck(prepstring,longest,logfile)
    else:
        logfile.write('fracture not reflected correctly: {}\t{}\n'.format(prepstring,stringreg))
    return(prepstring,beforeafter)
def addinfocheck(prepstring,stringreg,beforeafter = []):
    '''
    Reads the additional information that is stored in standardized strings
    in form of bracketed annotations and writes it into prepstring.
    This information can't be kept stored in the tagged text.
    Deletion, however, will create empty tags.

    Parameters
    ----------
    prepstring : str
    stringreg : str
    beforeafter : list,optional
        default value = []; can pass on values in case of recursive use of function

    Returns
    -------
    prepstring : str
    beforeafter : list
        a list of tuples (string part without add info, string part with add info)

    Raises
    ------
    SystemExit
        If the location of the additional information can't be determined
        or stringreg doesn't match any of the expected patterns.

    Notes
    -----
    NOTE(review): the default list is created once and shared between all
    calls that omit beforeafter; the visible caller always passes a fresh list.
    NOTE(review): the pattern literals below look damaged (unbalanced or
    regrouped ASCII parentheses where bracket characters of the data were
    presumably meant); verify against the original file. The group numbers
    used below refer to the patterns as originally written.
    '''
    #Part of additional information is linked to information on slip fracture
    #This is already processed and needs to be exempted here
    stringreg = re.sub('【[^(】]*([^】]+】','',stringreg)
    if re.search('^[^(]*([^)/]+)[^(/]*/[^(/]*([^)/]+)[^(]*$',stringreg):
        #Hits multiple stringreg expressions "stringreg1/stringreg2".
        #Doesn't hit "沅陽(令/長)" etc.
        #"stringreg1/stringreg2" is split into single stringreg and fed into recursive calls of the function
        temp = stringreg.split('/')
        for stringreg in temp:
            prepstring,beforeafter = addinfocheck(prepstring,stringreg,beforeafter)
    elif '(' in stringreg:
        #Two different patterns of additional information
        #abc(def)ghi: Only one add info string. def = add info
        m1 = re.search('^([^(]*)(([^)]+))([^(]*)$',stringreg)
        #abc(def)ghi(jkl)mno: Two add info strings. def,jkl = add infos
        m2 = re.search('(([^)]+))([^(]+)(([^)]+))',stringreg)#case of two add infos
        #m2 also hits "御史(大夫)/(監)御史". Order of application changes results!
        if m1:
            #check for misinterpretation
            #m1.group(2) in prepstring:add info already entailed in original string
            #m1.group(1/3) not in prepstring: location to add information not specified
            if m1.group(1) not in prepstring and m1.group(3) not in prepstring:
                sys.exit('Failure in addinfocheck m1\nprepstring is:{}\nstringreg is {}'.format(prepstring,stringreg))
            elif m1.group(1):
                before = m1.group(1)[-1]#the last character of string part before add info
                after = m1.group(1)[-1] + m1.group(2)
            elif m1.group(3):
                before = m1.group(3)[0]#the first character of string part behind add info
                after =m1.group(2) + m1.group(3)[0]
            #only the first tuple collected so far is checked for the same add info
            if beforeafter and after in beforeafter[0]:
                pass
            else:
                beforeafter.append((before,after))
                prepstring = re.sub(before,after,prepstring)
        elif m2:
            #check for misinterpretations
            #m2.group(2) == '':no characters between two add infos→location to add information not specified
            #m2.group(2) not in prepstring:location to add information not specified
            if m2.group(2) == '' or m2.group(2) not in prepstring:
                sys.exit('Failure in addinfocheck m2\nprepstring is:{}\nstringreg is {}'.format(prepstring,stringreg))
            before = m2.group(2)
            after = m2.group(1) + m2.group(2) +m2.group(3)
            beforeafter.append((before,after))
            prepstring = re.sub(before,after,prepstring)
        else:
            sys.exit('mismatch in addinfocheck:{}\t{}\n'.format(prepstring,stringreg))
    return(prepstring,beforeafter)
def stringprep(slipnodic,filename):
    '''
    Prepares strings for tag matching.
    Removes or adjusts special symbols in the original string.
    Extracts extra information stored in standardized string forms.

    Parameters
    ----------
    slipnodic : dict
        {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]]]}
    filename : str
        Path of the logfile; it is opened for writing (utf-8-sig).

    Returns
    -------
    slipnodic : dict
        {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]
                    ['brackets',[(strwithbr,strwithoutbr)],
                     'fractures',[(strwithfrmark,strwithoutfrmark)],
                     'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],
                    'prepstring']]}
        The records are extended in place.
    '''
    #The with-block guarantees that the logfile is closed, also when one of
    #the checks terminates the program or raises.
    with open(filename,'w',encoding='utf-8-sig') as logfile:
        for records in slipnodic.values():
            for record in records:
                record.append(['brackets',[],'fractures',[],'addinfo',[]])
                prepstring = record[0][0]
                #brackets
                prepstring,record[7][1] = bracketcheck(prepstring)
                #fracture symbols
                stringreg = record[1][0]
                prepstring,record[7][3] = fracturecheck(prepstring,stringreg,logfile)
                #additional information; a fresh list is passed for every record
                prepstring,record[7][5] = addinfocheck(prepstring,stringreg,[])
                record.append(prepstring)
    return(slipnodic)
#Consistency checks and consolidation of data
def reflectaddinfo(addinfolist,prepstring):
    '''
    Reflects the add info stored in addinfolist on prepstring.

    Used for two situations:
    Equal merge of strings like "司空、尉主" or "倉、司空主".
        Prepstring differ like "司空、尉主" and "司空(司空)、尉主" or "倉(司空)、司空主" and "倉、司空(司空)主"
    Acquisition merger of strings like "其一人為田鼂養:成" and "田(嗇夫)鼂"
        Original string differs because of different edition policies in different indices

    Parameters
    ----------
    addinfolist : list
        [(strwithoutaddinfo,strwithaddinfo)]
    prepstring : str

    Returns
    -------
    prepstring : str
        Every occurrence of strwithoutaddinfo is replaced by strwithaddinfo,
        provided that strwithaddinfo is not yet entailed in prepstring.
    '''
    for strwithoutaddinfo,strwithaddinfo in addinfolist:
        if strwithoutaddinfo in prepstring and strwithaddinfo not in prepstring:
            #Both strings are literal text. A literal replacement keeps the
            #substitution in line with the literal "in" tests above; brackets
            #or other regex symbols in the data are not interpreted.
            prepstring = prepstring.replace(strwithoutaddinfo,strwithaddinfo)
    return(prepstring)
def concordancecheck(datalist):
    '''
    Searches for complete concordances between the original strings
    of the records within one single value of slipnodic.

    Parameters
    ----------
    datalist : list
        [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq],
        ['brackets',[(strwithbr,strwithoutbr)],'fractures',[(strwithfrmark,strwithoutfrmark)],
         'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],'prepstring']]

    Returns
    -------
    concordances : list
        [(a,b)]
        list of tuples. Every tuple entails the position of two records with concordant strings.
        a is to be incorporated into b.
        The tuples are in descending order of a. Every a appears only once,
        paired with the foremost record that carries the same string.
    '''
    ##precaution "(充)令" and "(有酉)令" will be mistakenly taken as concordant
    #Only stringori is compared:
    #Using prepstring leads to confusion of "□季(?)" and "□季" etc
    #and omission of "司空、尉主" and "倉、司空主" etc.
    concordances = []
    for later in range(len(datalist)-1,0,-1):
        wanted = datalist[later][0][0]
        #A record that is concordant with several earlier records needs to be
        #processed only once, e.g. of 5 4 / 5 3 / 4 3 only 5 3 and 4 3 remain
        foremost = next((earlier for earlier in range(later)
                         if datalist[earlier][0][0] == wanted),None)
        if foremost is not None:
            concordances.append((later,foremost))
    return(concordances)
def equalmerge(positions,datalist,slipno,logfile):
    '''
    Merges two records with the same original string within a single value
    of slipnodic. The data of record a are added to record b; record a is
    deleted or its frequency is reduced, depending on the frequencies.

    Parameters
    ----------
    positions : tuple
        (a,b) = a is to be absorbed in b
    datalist : list
        equivalent to slipnodic[slipno]
        [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq],
        ['brackets',[(strwithbr,strwithoutbr)],'fractures',[(strwithfrmark,strwithoutfrmark)],
         'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],'prepstring']]
        [stringori],[stringreg],[lemmatano] still have only one value each.
        The list is changed in place.
    slipno : str
        Only used for log messages.
    logfile : TextIOWrapper
        Receives messages about inconsistencies of string forms and frequencies.

    Returns
    -------
    datalist : list
    '''
    a,b = positions
    #reconfirmation; a difference is only reported, merging goes on
    if datalist[a][0] != datalist[b][0]:
        logfile.write('Inconsistency of original string form in equalmerge for record {} and {} of slip {}\n{}\n'
                      '\n'.format(b,a,slipno,datalist))
    #Merging data
    #prepstring
    if datalist[a][8] != datalist[b][8]:
        #checking for addinfo in a not reflected in b
        #e.g."司空、尉主" and "司空、尉主"
        datalist[b][8] = reflectaddinfo(datalist[a][7][5],datalist[b][8])
    if datalist[a][8] != datalist[b][8]:
        logfile.write('Inconsistency of prepstring in equalmerge for record {} and {} of slip {}\n{}\n'
                      '\n'.format(b,a,slipno,datalist))
    #stringreg: differing standardized forms are joined as "stringreg_b/stringreg_a"
    if datalist[a][1] != datalist[b][1]:
        datalist[b][1][0] += '/' + datalist[a][1][0]
    #ID: joined as "ID_b/ID_a"
    #NOTE(review): confirm that the IDs are meant to be joined also when the
    #standardized forms are identical
    datalist[b][2] += '/' + datalist[a][2]
    #lemmatano is only taken over if both records have one
    if datalist[a][3] and datalist[b][3]:
        datalist[b][3] += datalist[a][3]
    #reflemno and reffrq
    datalist[b][5] += datalist[a][5]
    datalist[b][6] += datalist[a][6]
    #lists of the stringprep info (brackets, fractures, addinfo)
    fields = (1,3,5)
    for n in fields:
        datalist[b][7][n] += datalist[a][7][n]
    #checking frequency and deleting record a
    #equal frequencies: a is completely absorbed; higher frequency of a: the surplus stays in a
    if datalist[a][4] and datalist[a][4] == datalist[b][4]:del datalist[a]
    elif not datalist[a][4] and datalist[a][6][0] == datalist[b][6][0]:del datalist[a]
    elif datalist[a][4] > datalist[b][4]:datalist[a][4] -= datalist[b][4]
    elif datalist[a][6] and datalist[a][6][0] > datalist[b][6][0]:datalist[a][6][0] -= datalist[b][6][0]
    else:logfile.write('Frequency Inconsistency in equalmerge for record {} and {} of slip {}\n{}\n'
                       '\n'.format(b,a,slipno,datalist))
    return(datalist)
def overlapcheck(datalist):
    '''
    Searches for partial overlaps between the strings of the records
    within one single value of slipnodic.
    Only feasible after elimination of complete concordances.

    Parameters
    ----------
    datalist : list
        equivalent to slipnodic[slipno]
        [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq],
        ['brackets',[(strwithbr,strwithoutbr)],'fractures',[(strwithfrmark,strwithoutfrmark)],
         'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],'prepstring']]
        [stringori],[stringreg],[lemmatano] still have only one value each.

    Returns
    -------
    overlapping : dict
        {b : [a]}
        b is the position of a record whose stringori or prepstring entails
        the stringori or prepstring of the later records a, while the ID of a
        is not entailed in the ID of b.
    '''
    overlapping = {}
    for later in range(1,len(datalist)):
        innerori = datalist[later][0][0]
        #stringori is checked as well because
        #using prepstring alone leads to confusion of "□季(?)" and "□季" etc
        innerprep = datalist[later][8]
        innerID = datalist[later][2]
        for earlier in range(later):
            outerori = datalist[earlier][0][0]
            outerprep = datalist[earlier][8]
            outerID = datalist[earlier][2]
            entailed = innerori in outerori or innerprep in outerprep
            if entailed and innerID not in outerID:
                overlapping.setdefault(earlier,[]).append(later)
    return(overlapping)
def acquisitionmerge(positions,datalist,slipno,delrecords,logfile):
    '''
    Absorbs a record whose string is concordant with a part of the string of
    another record into that other record.

    Parameters
    ----------
    positions : tuple
        (a,b) = a is to be absorbed in b
    datalist : list
        equivalent to slipnodic[slipno]
        [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq],
        ['brackets',[(strwithbr,strwithoutbr)],'fractures',[(strwithfrmark,strwithoutfrmark)],
         'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],[prepstring]]]
        [stringori],[stringreg],[lemmatano] still have only one value each.
        The list is changed in place.
    slipno : str
        Only used for log messages.
    delrecords : list
        Immediate deletion of unnecessary records would change positions and lead to mis-recognition.
        Therefore, positions of deletable records are accumulated and used for deletion after finishing of all merging processes
    logfile : TextIOWrapper

    Returns
    -------
    datalist : list
        [stringori],[stringreg],[lemmatano] obtained multiple values.
    delrecords: list
    '''
    a,b = positions
    absorbed,absorbing = datalist[a],datalist[b]
    #Absorbing [stringori],[stringreg],[ID],[lemmatano],[reflemno],[prepstring] of a into b
    absorbing[8][0] = reflectaddinfo(absorbed[7][5],absorbing[8][0])
    for n in (0,1,2,3,5,8):
        absorbing[n] += absorbed[n]
    #Absorbing the lists of stringprep info (brackets, fractures, addinfo)
    for n in (1,3,5):
        absorbing[7][n] += absorbed[7][n]
    #Adjustment of frequency numbers
    #records without item frequency fall back on their first reference frequency
    frqa = absorbed[4] if absorbed[4] else absorbed[6][0]
    frqb = absorbing[4] if absorbing[4] else absorbing[6][0]
    #Number of times the string of a is entailed in the string of b.
    #The strings are data, not patterns, so they are counted literally;
    #the higher count of original string and prepstring is used.
    frqainb = max(absorbing[0][0].count(absorbed[0][0]),
                  absorbing[8][0].count(absorbed[8][0]))
    #appearances of a that are not explained by appearances of b
    frqa = frqa-(frqb*frqainb)
    if frqa == 0 and a not in delrecords:
        delrecords.append(a)
    elif frqa > 0 and absorbed[4]:
        absorbed[4] = frqa
    elif frqa > 0:
        absorbed[6][0] = frqa
    else:
        logfile.write('frequency inconsistency in acquisitionmerge for record {} '
                      'and {} in slip {}\n{}\n\n'.format(b,a,slipno,datalist))
    return(datalist,delrecords)
def indexdataconsolidation(slipnodic,filename):
    '''
    Searches and merges concordant and overlapping records
    within the individual values of slipnodic.

    Parameters
    ----------
    slipnodic : dict
        {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]
                    ['brackets',[(strwithbr,strwithoutbr)],
                     'fractures',[(strwithfrmark,strwithoutfrmark)],
                     'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],
                    'prepstring']]}
    filename : str
        name of logfile; it is opened for writing (utf-8-sig).

    Returns
    -------
    slipnodic : dict
        {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq]
                    ['brackets',[(strwithbr,strwithoutbr)],
                     'fractures',[(strwithfrmark,strwithoutfrmark)],
                     'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],
                    [prepstring]]]}
    '''
    #The with-block guarantees that the logfile is closed, also when one of
    #the merging steps raises.
    with open(filename,'w',encoding='utf-8-sig') as logfile:
        for slipno in slipnodic:
            #Checking for concordances and merging concordant records.
            #The pairs come in descending order of the absorbed position, so
            #deleting a record does not shift positions still to be processed.
            for pair in concordancecheck(slipnodic[slipno]):
                slipnodic[slipno] = equalmerge(pair,slipnodic[slipno],slipno,logfile)
            #Checking for overlaps
            overlapping = overlapcheck(slipnodic[slipno])
            #Changing IDs and prepstrings into lists
            for record in slipnodic[slipno]:
                for m in (2,8):
                    record[m] = [record[m]]
            #Absorbing overlapping records
            delrecords = []
            for b in sorted(overlapping):
                for a in overlapping[b]:
                    slipnodic[slipno],delrecords = acquisitionmerge((a,b),slipnodic[slipno],slipno,delrecords,logfile)
            #deleting from the back keeps the remaining positions valid
            delrecords.sort(reverse=True)
            for position in delrecords:
                del slipnodic[slipno][position]
            #deleting duplications in stringprep info (brackets,fractures,addinfo)
            for record in slipnodic[slipno]:
                for m in (1,3,5):
                    record[7][m] = tools.listduplicationdel(record[7][m])
    return(slipnodic)
#tagpreparation
def tagpreplookuplemmata(lemmatano,lemmata):
    '''
    Looks up lemmata from the lemmata list by lemmata number and
    creates a temporary dict connecting lemmata with lemmata numbers.
    Lemmata numbers will be necessary for creation of tags.

    Parameters
    ----------
    lemmatano : list
        List of lemmata numbers (positions in lemmata)
    lemmata : list
        [(headword,headst,ID)]

    Returns
    -------
    lemmatainstr : list
        List of lemmata (headst) entailed in the string,
        in descending order of length
    temp : dict
        {lemma:[lemmatano]}
        necessary in order to create tags after sorting lemmata by length.
        Every lemmata number is listed once per lemma.
    '''
    lemmatainstr = []
    temp = {}
    #extracting lemmata
    for number in lemmatano:
        lemma = lemmata[number][1]
        if lemma not in temp:
            temp[lemma] = [number]
            lemmatainstr.append(lemma)
        #a number is registered only once per lemma
        elif number not in temp[lemma]:
            temp[lemma].append(number)
    #sorting lemmata in descending order of length
    lemmatainstr = sorted(lemmatainstr , key=lambda x: len(re.findall('.',x))*-1)
    return(lemmatainstr,temp)
def sortlemmatano(lemmatainstr,temp):
    '''
    Sorts lemmata numbers in the order of lemmatainstr and returns them as a list of tag numbers.

    Parameters
    ----------
    lemmatainstr : list
        List of lemmata related to string
    temp : dict
        {lemma:[lemmatano]}

    Returns
    -------
    numbers : list
        ['lemmatano']
        One string per lemma, in the order of lemmatainstr.
        In case of different lemmata numbers for lemmata with identical standardized writing:
            'lemmatano1/lemmatano2'
    '''
    numbers = []
    for lemma in lemmatainstr:
        unique = []
        for lemmatano in temp[lemma]:
            text = str(lemmatano)
            #Whole numbers are compared. Testing against the joined string
            #would drop e.g. number 1 after number 12 has been entered.
            if text not in unique:
                unique.append(text)
        numbers.append('/'.join(unique))
    return(numbers)
def tagprep(slipnodic,lemmata,filename):
    '''
    Looks up the lemmata related to the strings of all records, sorts them
    in descending order of length and prepares the lemmata numbers for tagging.

    Parameters
    ----------
    slipnodic : dict
        {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq]
                    ['brackets',[(strwithbr,strwithoutbr)],
                     'fractures',[(strwithfrmark,strwithoutfrmark)],
                     'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],
                    [prepstring]]]}
        [lemmatano] in order of appearance in index data
    lemmata : list
        [(headword,headst,ID)]
    filename : str
        Not used by this function.

    Returns
    -------
    slipnodic : dict
        {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq]
                    ['brackets',[(strwithbr,strwithoutbr)],
                     'fractures',[(strwithfrmark,strwithoutfrmark)],
                     'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],
                    [prepstring],[lemmata]]]}
        [lemmatano] in order of [lemmata], numbers as strings
        [lemmata] in descending order of length
        The records are changed in place.
    '''
    for records in slipnodic.values():
        for record in records:
            #main lemma numbers and reference lemma numbers are looked up together
            allnumbers = record[3] + record[5]
            lemmatainstr,temp = tagpreplookuplemmata(allnumbers,lemmata)
            #adding the lemmata to the record
            record.append(lemmatainstr)
            #resorting lemmata numbers in order of lemmatainstr
            record[3] = sortlemmatano(lemmatainstr,temp)
    return(slipnodic)
#tagging
def inserttag(prepstring,tags,tagobjects,prefix,suffix,n,slipno,record,logfile):
    '''
    Inserts tags into prepstring, enclosing the taggable strings.

    For every tag object a search pattern is built, the first match in
    prepstring is taken as the string to be tagged, and all its occurrences
    are enclosed by a tag that carries the corresponding entry of tags.

    Parameters
    ----------
    prepstring : str
        slipnodic[slipno][n][8][0]
    tags : list
        Tag contents; tags[a] belongs to tagobjects[a].
    tagobjects : list
        Strings to be tagged.
    prefix : str
        if '(<\d+>)*', tagged string part will include preceding start tags
        if '', tagged string part won't include any preceding start tags
    suffix : str
        if '(\d+>)*', tagged string part will include following end tags
        if '', tagged string part won't include any following end tags
    ---below is only for logfile use---
    n : int
        number of record in slipnodic[slipno]
    slipno : str
    record : list
        slipnodic[slipno][n]
    logfile : TextIOWrapper

    Returns
    -------
    prepstring : str

    Raises
    ------
    SystemExit
        If tags and tagobjects differ in length.

    Notes
    -----
    NOTE(review): the end tag is assembled as '' + number + '>', i.e. without
    an opening '</' - presumably characters were lost here and in the
    suffix pattern described above; verify against the original file.
    '''
    if len(tags) != len(tagobjects):sys.exit('Number of tagobjects not equal to number'
        'of tagobjects numbers in record {} of slip {}\ntags are {}\n{}\n\n'.format(n,slipno,tags,record))
    for a in range(len(tagobjects)):
        searchstring = tools.searchpreparation(tagobjects[a],'[┘## 〼■、,a-z()]*',prefix,suffix)
        #For searchpreparation, [<\d/>()]* would normally be sufficient. However, stringori
        #as revised in stringprep on the basis of sliptext contains the special symbols
        #detected by [┘## 〼■、,a-z()], so these have to be included in the regular
        #expression here as well.
        #Note that the stringoris passed to this function as tagobjects are actually based
        #on stringreg and do not reflect the changes made by stringprep. Properly, stringreg
        #should be revised there, or stringori should be used for stringoris, but for
        #various reasons this is not possible.
        m = re.search(searchstring,prepstring)
        if m:
            #the matched text itself is used as search pattern from here on
            before = m.group(0)
            hits = re.findall(before,prepstring)
            if len(hits) > 1:logfile.write('tagobject {} found more than one time'
                                          ' in prepstring {} for slip {}'
                                          .format(before,prepstring,slipno))
            elif len(hits) == 0:logfile.write('tagobject {} not found in prepstring {} for slip {}'
                                             .format(before,prepstring,slipno))
            after = '<' + str(tags[a]) + '>' + before + '' + str(tags[a]) + '>'
            prepstring = re.sub(before,after,prepstring)
        else:
            logfile.write('mismatch for lemma {} in slip {}\nfull record:\n{}'
                          '\n\n'.format(tagobjects[a],slipno,record))
    return(prepstring)
def tagging(slipnodic,filename):
    r'''
    Tags the prepared string of every record, first by lemma and then by ID.

    Parameters
    ----------
    slipnodic : dic
        {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq]
                    ['brackets',[(strwithbr,strwithoutbr)],
                    'fractures',[(strwithfrmark,strwithoutfrmark)],
                    'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],
                    [prepstring],[lemmata]]]}
    filename : str
        Path of the log file; it is created or overwritten.

    Returns
    -------
    slipnodic : dic
        The same object, modified in place: record[8][0] (prepstring) is
        replaced by the tagged string.
    '''
    # The with-statement guarantees that the log is flushed and closed,
    # also when inserttag aborts.
    with open(filename,'w',encoding='utf-8-sig') as logfile:
        for slipno,records in slipnodic.items():
            for n,record in enumerate(records):
                # first pass: one tag per lemma, labelled with the lemma number
                lemmatano,lemmata = record[3],record[9]
                record[8][0] = inserttag(record[8][0],lemmatano,lemmata,'','',n,slipno,record,logfile)
                # second pass: one tag per string, labelled with the ID;
                # the strings are taken from index 1 of the record
                IDs,stringoris = record[2],record[1]
                record[8][0] = inserttag(record[8][0],IDs,stringoris,r'(<\d+>)*',r'(\d+>)*',n,slipno,record,logfile)
    return(slipnodic)
#Restoration of original string = Removal of string preparations
def removeaddinfo(tagstring,addinfo,logfile):
    r'''
    Removes additional info originally deriving from bracketed info in stringreg.

    Parameters
    ----------
    tagstring : str
        Tagged string (slipnodic[slipno][n][8][0] in stringprepremoval).
    addinfo : list
        [(before,after)] = list of tuples (slipnodic[slipno][n][7][5]).
        Each tuple contains a pair of string parts before and after adding
        info; only the second element is read here. The list is processed
        from its last entry to its first.
    logfile : TextIOWrapper

    Returns
    -------
    tagstring : str
        The tagged string with the characters of the added info removed.
    '''
    subslash = compile(r'([^<])/')#a slash that is not part of '</'
    for a in range(len(addinfo)-1,-1,-1):
        after = addinfo[a][1]#e.g."(啓陵)郵人(匄)","道(令/長)"
        # NOTE(review): the original comment gives [(啓陵),(匄)] and [(令/長)]
        # as the expected result here, whereas the pattern as written captures
        # every run of characters other than ')'. Confirm that the pattern
        # fits the bracket characters actually used in the data.
        adds = re.findall('([^)]+)',after)
        for add in adds:
            regex = tools.searchpreparation(add,r'[<\d/a-z>]*','','')
            hits = re.findall(regex,tagstring)#e.g.['(<60>匄60>)']
            if not hits:
                prepstring = re.sub(r'[\da-z>]','',tagstring)
                logfile.write('addinfo {} not found in string {}\nprepstring is {}\n\n'.format(add,tagstring,prepstring))
                continue
            #preparation of a version of the hit with addinfo removed
            remnants = hits[0]
            delchar = re.findall('.',add)
            for char in delchar:
                if char == '/':
                    remnants = subslash.sub(r'\g<1>',remnants,1)
                else:
                    # literal removal: the character may be special in a
                    # regular expression (e.g. a bracket)
                    remnants = remnants.replace(char,'',1)
            #checking removal results
            for char in tools.listduplicationdel(delchar):
                if char != '/' and char in remnants:
                    logfile.write('more than 1 appearance of character {} in string {},addinfo is '
                                  '{}\n\n'.format(char,hits[0],after))
            #removing addinfo from tagged string (first occurrence only)
            tagstring = tagstring.replace(hits[0],remnants,1)
    return(tagstring)
def restorebrackets(tagstring,brackets,slipno,logfile):
    '''
    Puts the original bracket expressions back into the tagged string.

    Parameters
    ----------
    tagstring : str
        Tagged string (slipnodic[slipno][n][8][0] in stringprepremoval).
    brackets : list
        [(before,after)] = list of tuples (slipnodic[slipno][n][7][1]).
        Each tuple contains a pair of string parts before and after the
        bracket check; every occurrence of after is replaced by before.
        The list is processed from its last entry to its first.
    slipno : str
        Only used in log messages.
    logfile : TextIOWrapper
        Receives a message if after is longer than one character, occurs
        more than once, or is not found at all.

    Returns
    -------
    tagstring : str
    '''
    for entry in reversed(brackets):
        before,after = entry[0],entry[1]
        if len(after) > 1:
            logfile.write('prepstring part after bracketcheck longer than'
                          ' one character in slip {}\nbefore is {}, after is {}\n\n'
                          .format(slipno,before,after))
        # Both strings are plain text, not patterns, so counting and replacing
        # is done literally; special characters and backslashes are left alone.
        hits = tagstring.count(after)
        if hits > 1:
            logfile.write('prepstring part after bracketcheck found {}'
                          ' times in tagstring of slip {}\nbefore is {}, after is {}\n\n'
                          .format(hits,slipno,before,after))
        elif hits == 0:
            logfile.write('prepstring part after bracketcheck not found'
                          ' in tagstring of slip {}\nbefore is {}, after is {}\n\n'
                          .format(slipno,before,after))
        tagstring = tagstring.replace(after,before)
    return(tagstring)
def restorefracmarks(tagstring,fracinfo):
    '''
    Restores the original fracture marks.
    E.g. 【嬰児某】 → '〼'

    Parameters
    ----------
    tagstring : str
        String in which fracture marks have been replaced.
    fracinfo : list
        List of pairs; for every pair the first occurrence of element 1 in
        tagstring is replaced by element 0.

    Returns
    -------
    tagstring : str
        The string after replacement. Passes over fracinfo are repeated as
        long as '【' is still present and the last pass changed something.
    '''
    while True:
        previous = tagstring
        for entry in fracinfo:
            # literal replacement of one occurrence per entry and pass
            tagstring = tagstring.replace(entry[1],entry[0],1)
        # Stop when nothing is left to restore, or when a pass made no
        # progress (otherwise an unmatched '【' would never terminate).
        if '【' not in tagstring or tagstring == previous:
            return(tagstring)
def divergencecheck(stringori,tagstring,fracinfo):
    '''
    Removes all tags from tagstring and checks divergence from original string.

    Parameters
    ----------
    stringori : str
        Original string.
    tagstring : str
        Tagged string.
    fracinfo : list
        Passed to restorefracmarks after the tags have been removed.

    Returns
    -------
    str
        The stripped string if it is not identical to the original string
        and the empty string '' if it is identical.
    '''
    # Numeric tags. Start tags look like '<12>'; end tags are accepted both
    # in the bare form '12>' written by inserttag and in the form '</12>'.
    tagstring = re.sub(r'<?[\d/]+>','',tagstring)
    # Alphabetic tags whose name starts with p, o, r or l, in the same forms.
    tagstring = re.sub(r'<?/?[porl][a-z/]+>','',tagstring)
    tagstring = restorefracmarks(tagstring,fracinfo)
    if stringori == tagstring:
        return('')
    return(tagstring)
def stringprepremoval(slipnodic,lemmata,filename):
    '''
    Restores the original string by removing the alterations made during
    string preparation, i.e. restoration of bracket expressions and removal
    of additional information taken from standardized string forms.

    Strings with an entry in the manual tagging input are replaced by that
    entry instead. Every resulting string is compared with the original
    string after tag removal; divergences are written to
    'log(tagremovalmanualcorrectionneeded.txt'.

    Parameters
    ----------
    slipnodic : dic
        {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq]
                    ['brackets',[(strwithbr,strwithoutbr)],
                    'fractures',[(strwithfrmark,strwithoutfrmark)],
                    'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],
                    [tagstring],[lemmata]]]}
    lemmata : list
        [(headword,headst,ID)]
    filename : str
        Path of the log file handed to removeaddinfo and restorebrackets.

    Returns
    -------
    slipnodic : dic
        The same object, with record[8][0] updated in place.
    '''
    # Both logs are closed by the with-statements, also if a helper raises.
    with open(filename,'w',encoding='utf-8-sig') as logfile:
        with open('log(tagremovalmanualcorrectionneeded.txt','w',encoding='utf-8-sig') as logfile02:
            manualtagging = inputmethods.manualtagginginput('input(manualtagging).txt')
            for slipno,records in slipnodic.items():
                for record in records:
                    stringori = record[0][0]
                    manualkey = slipno+stringori
                    if manualkey in manualtagging.keys():
                        #manually tagged strings take precedence
                        tagstring = record[8][0] = manualtagging[manualkey]
                    else:
                        #removing info added depending on stringreg
                        tagstring = removeaddinfo(record[8][0],record[7][5],logfile)
                        #restoring brackets coming from original string
                        tagstring = record[8][0] = restorebrackets(tagstring,record[7][1],slipno,logfile)
                    #checking convergence with original string
                    tagremovedstr = divergencecheck(stringori,tagstring,record[7][3])
                    if tagremovedstr:
                        logfile02.write('Tagged string of slip {} different from original string!\n'
                                        'Original string:{}\ntagged string without tags:{}\ntagged string is{}\n'
                                        .format(slipno,stringori,tagremovedstr,tagstring))
                        #listing the lemmata of the record for manual correction
                        for no in record[3]:
                            logfile02.write('{}\t{}\n'.format(no,lemmata[int(no)]))
                        logfile02.write('\n')
    return(slipnodic)
def matchexepttaggedstring(string,text):
    '''
    Finds all matches of string in text except strings already tagged
    and returns them as a list.

    Every match is extended by at most one neighbour on either side: a
    numeric tag, or a single character that is neither a digit nor '>'.
    Matches in which string is directly enclosed by a start tag and the end
    tag of the same number are dropped.

    Parameters
    ----------
    string : str
        Literal text to look for (it is escaped before being searched).
    text : str

    Returns
    -------
    liste : list
        The extended matches, in order of appearance.
    '''
    literal = re.escape(string)
    # End tags are accepted in the bare form '12>' written by inserttag as
    # well as in the form '</12>'.
    neighbour = r'(?:(?:<?/?\d+>)|[^\d>])?'
    templist = re.findall(neighbour+literal+neighbour,text)
    tagged = re.compile(r'<(\d+)>{}(?:</)?\1>'.format(literal))
    liste = [x for x in templist if not tagged.search(x)]
    return(liste)
#Replacing the strings in original text by tagged strings
def replacewithfrqcheck(stringori,frqindex,tagstring,slipno,sliptextstring,logfile):
    '''
    Looks up an index string in the slip text and replaces it with the
    tagged string.

    Extended strings (the string plus its direct neighbours, as returned by
    matchexepttaggedstring) are used in order to exclude strings that are
    already tagged. A difference between the number of occurrences in the
    index and in the text is logged; the replacement is carried out anyway.

    Parameters
    ----------
    stringori : str
        Literal text to be replaced.
    frqindex : int
        Number of occurrences according to the index.
    tagstring : str
        Tagged replacement for stringori.
    slipno : str
        Only used in log messages.
    sliptextstring : str
        Slip text to work on.
    logfile : TextIOWrapper

    Returns
    -------
    sliptextstring : str
        The slip text after replacement.
    '''
    #creating list of appliable strings
    #using extended strings in order to exclude tagged strings
    xstringori = matchexepttaggedstring(stringori,sliptextstring)
    #checking frequency inconsistency
    frqtext = len(xstringori)
    if frqtext != frqindex:
        # More hits than expected: partial overlap between different
        # stringori, such as 尉守蜀 and 尉 in slip 8-0652a+8-0067a, inflates
        # the text matches. Fewer hits: string variations between text and
        # index cause mismatches.
        cause = 'overlapping text' if frqtext > frqindex else 'mismatching'
        logfile.write('frequency inconsistency for {} in slip {} caused by {}\n'
                      'frequency in index is {} , frequency in text is {}\n'
                      'Full text is as follows:\n{}\n'
                      '\n'.format(stringori,slipno,cause,frqindex,
                                  frqtext,sliptextstring))
    for xstring in xstringori:
        # All replacements are literal: the extended string may contain
        # neighbouring characters that are special in a regular expression.
        xtagstring = xstring.replace(stringori,tagstring)#tagging extended string
        sliptextstring = sliptextstring.replace(xstring,xtagstring,1)#replacing first remaining occurrence in slip text
    return(sliptextstring)
def sliptextreplacement(slipnodic,sliptext,filename):
    '''
    Replaces original strings with tagged strings and returns a tagged copy
    of the slip text data.

    Records whose slip number contains '~' stretch over two slips. The first
    record of such an entry is split at '#' and one half is appended to each
    of the two slips in slipnodic (slipnodic is modified in place).

    Parameters
    ----------
    slipnodic : dic
        {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq]
                    ['brackets',[(strwithbr,strwithoutbr)],
                    'fractures',[(strwithfrmark,strwithoutfrmark)],
                    'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],
                    [tagstring],[lemmata]]]}
    sliptext : dic
        {slipno:[sliptext,docstyle,posinstyle,posindoc]}
        Left untouched; the work is done on a deep copy.
    filename : str
        Path of the log file; it is created or overwritten.

    Returns
    -------
    sliptexttag : dic
        {slipno:[sliptext,docstyle,posinstyle,posindoc]}
    '''
    sliptexttag = copy.deepcopy(sliptext)
    with open(filename,'w',encoding='utf-8-sig') as logfile:
        #Checking slip numbers
        #(iterating over a snapshot because new keys are added on the way)
        for slipno in list(slipnodic.keys()):
            if '~' in slipno:#index data stretch over two slips
                tempslipno = slipno.split('~')
                if len(slipnodic[slipno]) > 1:
                    logfile.write('Straddling data of slip {} has more than one record\n'.format(slipno))
                # Two independent copies: each half is edited separately, so
                # they must not share the same lists.
                firsthalf = copy.deepcopy(slipnodic[slipno][0])
                secondhalf = copy.deepcopy(slipnodic[slipno][0])
                #part before '#' goes to the first slip
                firsthalf[0][0] = re.sub('#.+$','',firsthalf[0][0])
                firsthalf[8][0] = re.sub('#.+$','',firsthalf[8][0])
                slipnodic.setdefault(tempslipno[0],[]).append(firsthalf)
                #part after the first '#' goes to the second slip
                secondhalf[0][0] = re.sub('^[^#]+#','',secondhalf[0][0])
                secondhalf[8][0] = re.sub('^[^#]+#','',secondhalf[8][0])
                slipnodic.setdefault(tempslipno[1],[]).append(secondhalf)
            elif slipno not in sliptexttag:
                logfile.write('slip number {} not found in slip text\n'.format(slipno))
        #Replacing strings
        for slipno,records in slipnodic.items():
            if '~' in slipno or slipno not in sliptexttag:
                continue
            for record in records:
                stringori,frqindex,tagstring = record[0][0],record[4],record[8][0]
                sliptexttag[slipno][0] = replacewithfrqcheck(stringori,frqindex,tagstring,slipno,sliptexttag[slipno][0],logfile)
    return(sliptexttag)
def histdatetagging(histdates,filename):
    r'''
    Creates tags for historical dates and inserts them in pertinent strings.

    For every record the string at index 2 is tagged with the value at
    index 4, and the result is tagged once more with the label 'histdate'.
    The tagged string is appended to the record.

    Parameters
    ----------
    histdates : dic
        {slipnumber:[[stringori,frq,stringreg,slipno,IDchr,IDBCE,slipnote]]}
    filename : str
        Path of the log file; it is created or overwritten.

    Returns
    -------
    histdates : dic
        The same object, modified in place:
        {slipnumber:[[stringori,frq,stringreg,slipno,IDchr,IDBCE,slipnote,tagstring]]}
    '''
    # The with-statement closes the log, which was left open before.
    with open(filename,'w',encoding='utf-8-sig') as logfile:
        for slipno,records in histdates.items():
            for n,record in enumerate(records):
                stringreg = record[2]
                tagobjects = [stringreg]
                #inner tag
                tagstring = inserttag(stringreg,[record[4]],tagobjects,'','',n,slipno,record,logfile)
                #outer tag enclosing the tags inserted above
                tagstring = inserttag(tagstring,['histdate'],tagobjects,r'(<\d+>)*',r'(\d+>)*',n,slipno,record,logfile)
                record.append(tagstring)
    return(histdates)
def histdatesreplacemnt(histdates,sliptext,filename):
    '''
    Replaces historical date strings in slip text with tagged strings.

    Parameters
    ----------
    histdates : dic
        {slipnumber:[[stringori,frq,stringreg,slipno,IDchr,IDBCE,slipnote,tagstring]]}
    sliptext : dic
        {slipno:[sliptext,docstyle,posinstyle,posindoc]}
        Modified in place.
    filename : str
        Path of the log file; it is created or overwritten.

    Returns
    -------
    sliptext : dic
        {slipno:[sliptext,docstyle,posinstyle,posindoc]}
        The same object that was passed in.
    '''
    # The with-statement closes the log, which was left open before.
    with open(filename,'w',encoding='utf-8-sig') as logfile:
        for slipno,records in histdates.items():
            if slipno not in sliptext:
                logfile.write('slip number {} not found in slip text data\n\n'.format(slipno))
                continue
            for record in records:
                stringori,frqindex,tagstring = record[0],record[1],record[7]
                sliptext[slipno][0] = replacewithfrqcheck(stringori,frqindex,tagstring,slipno,sliptext[slipno][0],logfile)
    return(sliptext)
if __name__ == "__main__":
    # Placeholder: running the module as a script only binds a dummy name.
    zero = 0