# -*- coding: utf-8 -*- """ Created on Sat Apr 27 20:08:25 2024 @author: Arnd Helmut Hafner """ #processes the index data in order to tag the original text import sys # for sys.exit after error import re import tools import copy from re import compile, search import inputmethods #Creation of dictionary of index data accessible by slip number def slipnofrqtest(sliplist): ''' Checking the consistency of frequency numbers in records of slipnodic Parameters ---------- sliplist : list One value of slipnodic [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]]] Returns ------- inconsistency : list list of the positions of records that include inconsistencies. ''' inconsistency = [] for n in range(len(sliplist)): for m in range(1,len(sliplist[n][6])): if sliplist[n][6][m] != sliplist[n][6][0]:inconsistency.append(n) if sliplist[n][4] and sliplist[n][6] and sliplist[n][6][0] != sliplist[n][4] and n not in inconsistency: #if there are no items,frq will be zero; if there are no refitems, reffrq will be an empty list. inconsistency.append(n) return(inconsistency) def createdic(lemmata,items,refitems): '''creating a dictionary for access to data by slip number Parameters ---------- lemmata : list #[(headword,headst,ID)] items : list [(stringori,stringreg,lemmatano,slipno,slipnote,notes)] refitems : list [(stringori,stringreg,lemmatano,slipno,slipnote,notes)] Returns ------- slipnodic :dic {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]]]} [stringori],[stringreg],[lemmatano] are put in list form in preparation for merging overlapping strings [reflenno],[reffrq] are put in list form because every string appears once under the main lemma and numerous times under different reference lemmata. ''' slipnodic = {} tempdic = {}#remembers the record structure within each value list in slipnodic #Entering item data into slipnodic for n in range(len(items)): stringori,stringreg,lemmatano,slipno,slipnote,notes = items[n] ID = lemmata[lemmatano][2] tempkey = stringori+stringreg+ID if slipno not in slipnodic.keys(): slipnodic[slipno] =[[[stringori],[stringreg],ID,[lemmatano],1,[],[]]] tempdic[slipno] = {} tempdic[slipno][tempkey] = 0 #using stringori+stringreg+ID as key because same original string #can be assigned two different standardized forms. E.g.: #司空、尉主→尉主 and 司空、尉主→司空主 #佐蒲、就→(啓陵郷)佐蒲 and 佐蒲、就→(啓陵郷)佐就 #Additionally, identical strings can be appear simultaneously in different indices else: if tempkey not in tempdic[slipno].keys(): tempdic[slipno][tempkey] = len(slipnodic[slipno]) slipnodic[slipno].append([[stringori],[stringreg],ID,[lemmatano],1,[],[]]) elif slipnodic[slipno][tempdic[slipno][tempkey]][3][0] ==lemmatano: slipnodic[slipno][tempdic[slipno][tempkey]][4] += 1 #tempdic[slipno][tempkey]: fieldnumber #slipnodic[slipno][tempdic[slipno][tempkey]][2][0]:lemmatano else: sys.exit('Error in item loop of createdic\n{}'.format(items[n])) #entering refitem data into slipnodic for n in range(len(refitems)): stringori,stringreg,lemmatano,slipno,slipnote,notes = refitems[n] ID = lemmata[lemmatano][2] tempkey = stringori+stringreg+ID if slipno not in slipnodic.keys(): slipnodic[slipno] =[[[stringori],[stringreg],ID,[],0,[lemmatano],[1]]] tempdic[slipno] = {} tempdic[slipno][tempkey] = 0 else: if tempkey not in tempdic[slipno].keys(): tempdic[slipno][tempkey] = len(slipnodic[slipno]) slipnodic[slipno].append([[stringori],[stringreg],ID,[],0,[lemmatano],[1]]) elif lemmatano in slipnodic[slipno][tempdic[slipno][tempkey]][5]: i = tools.lookuplistitem(lemmatano,slipnodic[slipno][tempdic[slipno][tempkey]][5]) i = i[0] slipnodic[slipno][tempdic[slipno][tempkey]][6][i] += 1 else: slipnodic[slipno][tempdic[slipno][tempkey]][5].append(lemmatano) slipnodic[slipno][tempdic[slipno][tempkey]][6].append(1) #Sorting slipnodic in declining order of length of stringori #Will be necessary when checking overlaps between strings logfile = open('logcreatdict(frequencycheck).txt','w',encoding='utf-8-sig') for slipno in slipnodic.keys(): slipnodic[slipno] = sorted(slipnodic[slipno] , key=lambda x: len(re.findall('.',x[0][0]))*-1) #checking appearance frequencies inconsistency = slipnofrqtest(slipnodic[slipno]) for n in inconsistency:logfile.write('inconsistency in record {} of slip {}\n{}\n'.format(n,slipno,slipnodic[slipno][n])) #merging frqdata (after above check didn't show any inconsistency) for n in range(len(slipnodic[slipno])): if not slipnodic[slipno][n][4]: slipnodic[slipno][n][4] = slipnodic[slipno][n][6][0] return(slipnodic) #Preliminary consistency check def simplefrqcheck(slipno,recordno,logfile): # pass def simpleconsistencycheck(slipnodic,sliptext,filename): ''' Checks data consistency by comparing appearance of index data in slip text data and outputs the result to a logfile Parameters ---------- slipnodic : dict {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]]]} sliptext : dict {slipno:[sliptext,docstyle,posinstyle,posindoc]} filename : TestIOWrapper Ouput ------- inconsistencies : a)slip numbers not found in slip text data b)strings with more or less appearances in the text than in the indices ''' logfile = open(filename,'w',encoding='utf-8-sig') for slipno in slipnodic.keys(): if slipno not in sliptext.keys(): logfile.write('slip number {} not found in sliptext\n'.format(slipno)) continue for n in range(len(slipnodic[slipno])): #frq = simplefrqcheck(slipno,recordno,logfile) stringori = slipnodic[slipno][n][0][0] searchstring = tools.searchpreparation(stringori,'[┘## 〼■(?)、,]*') frqindex = slipnodic[slipno][n][4] frqtext = len(re.findall(searchstring,sliptext[slipno][0])) if frqtext == frqindex:continue logfile.write('frequency inconsistency for {} in slip {}\n' 'frequency in index is {} , frequency in text is {}\n' 'Full text is as follows:\n{}\n' '\n'.format(stringori,slipno,frqindex, frqtext,sliptext[slipno][0])) logfile.close() return #Original text data string def stringorireplacement(slipnodic,sliptext,filename): ''' Replaces index data strings with text data strings Stringori is already a simplified version of the original text data strings. Thus, strings must be reversed into their original form in order to guarantee accurate tagging of original text data Parameters ---------- slipnodic : dict {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]]]} sliptext : dict {slipno:[sliptext,docstyle,posinstyle,posindoc]} filename : TestIOWrapper Returns ------- slipnodic : dict stringori is replaced by correspondent string in original slip text ''' logfile = open(filename,'w',encoding='utf-8-sig') for slipno in slipnodic.keys(): if '~' in slipno:#index data stretch over two slips #creating a temporary sliptext out of the #last 7 characters of slip1 and the first 7 characters of slip2 temp = slipno.split('~') sliptexttemp = re.findall('[^#]{1,7}#[^#]{1,7}',sliptext[temp[0]][0] + '#' + sliptext[temp[1]][0]) sliptexttemp = sliptexttemp[0] elif slipno not in sliptext.keys(): logfile.write('Slip number {} not found in slip text\n\n'.format(slipno)) continue else:sliptexttemp = sliptext[slipno][0] newrecords = [] for n in range(len(slipnodic[slipno])): #finding strings in text stringori = slipnodic[slipno][n][0][0] searchstring = tools.searchpreparation(stringori,'[┘## 〼■、,()]*?') if len(re.findall(stringori,sliptexttemp)) == len(re.findall(searchstring,sliptexttemp)): hitsinsliptext = tools.listitemcount(re.findall(stringori,sliptexttemp)) else:#In the case that correct matching is impeded by special characters hitsinsliptext = tools.listitemcount(re.findall(searchstring,sliptexttemp)) if not hitsinsliptext: logfile.write('String {} not found in slip {}\n slip text is:\n{}\n\n' .format(stringori,slipno,sliptexttemp)) continue ID = slipnodic[slipno][n][2] frqindex = slipnodic[slipno][n][4] #elimination of unrelated strings ( = matches with longer strings from other indices) if len(hitsinsliptext) > 1: for m in range(n-1,-1,-1): for strings in hitsinsliptext.keys(): if strings in slipnodic[slipno][m][0][0] and ID != slipnodic[slipno][m][2]: hitsinsliptext[strings] -= slipnodic[slipno][m][4] hitsinsliptext = {key: val for key, val in hitsinsliptext.items() if val != 0} if not hitsinsliptext:#already non-existent logfile.write('frequency inconsistency for {} in slip {} after first overlap check\n' 'no string left\n' 'Full text is as follows:\n{}\n\n' '\n'.format(stringori,slipno,sliptexttemp)) for strings in hitsinsliptext.keys(): if hitsinsliptext[strings] < 0:#already non-existent logfile.write('frequency inconsistency for {} in slip {} after first overlap check\n' 'frequency of {} dropped below 0\n' 'Full text is as follows:\n{}\n\n' '\n'.format(stringori,slipno,strings,sliptexttemp)) itemlist = list(hitsinsliptext.keys()) #replacing strings slipnodic[slipno][n][0][0] = itemlist[0] slipnodic[slipno][n][4] = frqtext = hitsinsliptext[itemlist[0]] if len(itemlist) == 1: #len(itemlist) == 1 means that there is only one variation of the string in the text #dealing with frequency inconsistencies if frqindex == frqtext: continue elif frqindex < frqtext: #searching for overlaps in the same index frqprocessed = frqtext for m in range(n-1,-1,-1): if stringori in slipnodic[slipno][m][0][0] and ID == slipnodic[slipno][m][2]: frqprocessed -= slipnodic[slipno][m][4] if frqindex != frqprocessed: logfile.write('frequency inconsistency for {} in slip {} after second overlap check\n' 'frequency in index is {} , frequency in text originally is {}, after processing is {}\n' 'Full text is as follows:\n{}\n\n' '\n'.format(stringori,slipno,frqindex,frqtext, frqprocessed,sliptexttemp)) elif frqindex > frqtext: #These occurrences have proved to be caused by errors in original data. #No further occurences after correction of data logfile.write('frequency inconsistency for {} in slip {}, more occurrences in index than in text\n' 'frequency in index is {} , frequency in text is {}\n' 'Full text is as follows:\n{}\n\n' '\n'.format(stringori,slipno,frqindex, frqtext,sliptexttemp)) #dealing with differences in string representation in slip text and frequency inconsistencies else:#dealing with different variations of the string frqprocessed = frqindex frqprocessed -= frqtext for x in range(1,len(itemlist)): nrecord = copy.deepcopy(slipnodic[slipno][n]) nrecord[0][0] = itemlist[x] frqtext = hitsinsliptext[itemlist[x]] nrecord[4] = frqtext frqprocessed -= frqtext newrecords.append((nrecord,n)) if frqprocessed != 0: logfile.write('frequency inconsistency for {} in slip {} after dealing with different representations\n' 'frequency in index originally is {}, after processing is {}, frequency in text is {}\n' 'Full text is as follows:\n{}\n\n' '\n'.format(stringori,slipno,frqindex,frqprocessed, frqtext,sliptexttemp)) slipnodic[slipno] = tools.listaddrecords(newrecords,slipnodic[slipno]) return(slipnodic) #stringprocessing. Bridging the gap between information stored in original strings #and information stored in standardized strings def bracketremoval(prepstring,beforeafter,regex,beforegroup,aftergroup,delinbefore = '',delinafter = ''): ''' Removes brackets and other special symbol from string, and accumulates removal information in form of tuples of string parts before and after Parameters ---------- prepstring : str Initially equivalent to slipnodic[slipno][n][0] = stringori beforeafter : list list of string parts before and after removal of special characters before = string part that is matched in order to remove special characters after = string part after removal regex : str Regular that hits special characters beforegroup :tuple tuple of numbers that indicate the match groups that are used as string part before aftergroup : tuple tuple of numbers that indicate the match groups that are used as string part after delinbefore : str, optional Special characters that need to be extraordinarily deleted in string part before. The default is ''. delinafter : str, optional Special characters that need to be extraordinarily deleted in string part before. The default is ''. Returns ------- prepstring,beforeafter. ''' bcheck = re.finditer(regex,prepstring) for a in bcheck: before = '' for n in beforegroup: if a.group(n):before += a.group(n) before = re.sub(delinbefore,'',before) after = '' for n in aftergroup: if a.group(n):after += a.group(n) after = re.sub(delinafter,'',after) prepstring = re.sub(before,after,prepstring,1) beforeafter.append((before,after)) return(prepstring,beforeafter) def bracketcheck(prepstring): ''' Removes brackets and some other symbols from original string and returns tuples of string parts before and string parts after processing Parameters ---------- prepstring : str Returns ------- beforeafter : list list of tuples, entailing tuples of relevant string parts in the form before and after processing ''' beforeafter = [] #Finding "[字]" = 衍字 = redundant character. Needs surrounding characters to re-identify location after deletion regex = '(.)?([({[^}]+}|.)])(.)?' prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(1,4),'','?') #finding "字(字)" . Contamination by "?" doesn't impede matching. #({[^}]+}|[^}﹦〗]) hits unicode external characters.{[^}]+} especially aims at code external #characters; [^}] avoids double match on last bracket of external characters, and [^﹦〗] #avoids other special characters regex = '({[^}]+}|[^}﹦〗])([(〔]??)([^?\-)〕]+)(??[)〕])' prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(3,),'','?') #Finding ""字(字-字)". Contamination by "?" doesn't impede matching. regex = '({[^}]+}|[^}﹦〗])([(〔]??[^?]\-)([^?])(??[)〕])' prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(3,),'','?') #Finding "字(?)" regex = '({[^}]+}|[^>])((?))' prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(1,)) #Finding ")字" regex = '()([^<]+)()'#囲い文字を検出 prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(2,)) #duplication symbols if re.search('﹦.﹦',prepstring): #For now, actually only one case:"琅﹦邪﹦守" in slip 8-0657a regex = '([^﹦])﹦' prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(1,),'[]') elif '﹦' in prepstring: regex = '(.)([〖]*﹦[〗]*()([^)]+)())' prepstring,beforeafter = bracketremoval(prepstring,beforeafter,regex,(0,),(3,),'[]') return(prepstring,beforeafter) def fracturecheck(prepstring,stringreg,logfile): ''' Reads information about text lost by slip fractures and writes them into prepstring. This information needs to be kept stored after tagging Parameters ---------- prepstring : str stringreg : str Returns ------- prepstring : str beforeafter : list a list of tuples ''' beforeafter = [] fracori = re.findall('〼  〼|〼……|……〼|……|〼', prepstring) fracreg = re.findall('【[^】]+】', stringreg) if len(fracori) == len(fracreg): for a in range(len(fracori)): beforeafter.append((fracori[a],fracreg[a])) prepstring = re.sub(fracori[a],fracreg[a],prepstring) else: if '/' in stringreg: #During index data consolidation, two stringreg have been merged for some records #resulting in "stringreg1/stringreg2 #The next three lines split stringreg again, sort them in declining order of length #and pick out the longest one stringreg = stringreg.split('/') stringreg = sorted(stringreg , key=lambda x: len(re.findall('.',x))*-1) stringreg = stringreg[0] prepstring,beforeafter = fracturecheck(prepstring,stringreg,logfile) else: logfile.write('fracture not reflected correctly: {}\t{}\n'.format(prepstring,stringreg)) return(prepstring,beforeafter) def addinfocheck(prepstring,stringreg,beforeafter = []): ''' Reads the additional information that is stored in standardized strings in form of bracketed annotations. This information can't kept stored in the tagged text. Deletion, however, will create empty tags. Parameters ---------- prepstring : str stringreg : str beforeafter : list,optional default vale = []; can pass on values in case of recursive use of function Returns ------- prepstring : str beforeafter : list a list of tuples ''' #Part of additional information is linked to information on slip fracture #This is already processed and needs to be exempted here stringreg = re.sub('【[^(】]*([^】]+】','',stringreg) if re.search('^[^(]*([^)/]+)[^(/]*/[^(/]*([^)/]+)[^(]*$',stringreg): #Hits multiple stringreg expressions "stringreg1/stringreg2". #Doesn't hit "沅陽(令/長)" etc. #"stringreg1/stringreg2" is split into single stringreg and feeded into recursive call of function temp = stringreg.split('/') for stringreg in temp: prepstring,beforeafter = addinfocheck(prepstring,stringreg,beforeafter) elif '(' in stringreg: #Two different patterns of additional information #abc(def)ghi: Only one add info string. def = add info m1 = re.search('^([^(]*)(([^)]+))([^(]*)$',stringreg) #abc(def)ghi(jkl)mno: Two add info strings. def,jkl = add infos m2 = re.search('(([^)]+))([^(]+)(([^)]+))',stringreg)#付加情報が二つある場合 #m2 also hits "御史(大夫)/(監)御史". Order of application changes results! if m1: #check for misinterpretation #m1.group(2) in prepstring:add info already entailed in original string #m1.group(1/3) not in prepstring: location to add information not specified if m1.group(1) not in prepstring and m1.group(3) not in prepstring: sys.exit('Failure in addinfocheck m1\nprepstring is:{}\nstringreg is {}'.format(prepstring,stringreg)) elif m1.group(1): before = m1.group(1)[-1]#the last character of string part before add info after = m1.group(1)[-1] + m1.group(2) elif m1.group(3): before = m1.group(3)[0]#the first character of string part behind add info after =m1.group(2) + m1.group(3)[0] if beforeafter and after in beforeafter[0]: pass else: beforeafter.append((before,after)) prepstring = re.sub(before,after,prepstring) elif m2: #check for misinterpretations #m2.group(2) == '':no characters between two add infos→location to add information not specified #m2.group(2) not in prepstring:location to add information not specified if m2.group(2) == '' or m2.group(2) not in prepstring: sys.exit('Failure in addinfocheck m2\nprepstring is:{}\nstringreg is {}'.format(prepstring,stringreg)) before = m2.group(2) after = m2.group(1) + m2.group(2) +m2.group(3) beforeafter.append((before,after)) prepstring = re.sub(before,after,prepstring) else: sys.exit('mismatch in addinfocheck:{}\t{}\n'.format(prepstring,stringreg)) return(prepstring,beforeafter) def stringprep(slipnodic,filename): ''' Prepares strings for tag matching. Removes or adjusts special symbols in the original string Extracts extra information stored in standardized string forms Parameters ---------- slipnodic : Dic {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq]]]} filename : TYPE Returns ------- slipnodic : dic {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq] ['brackets',[(strwithbr,strwithoutbr)], 'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]], 'prepstring']]} ''' logfile = open(filename,'w',encoding='utf-8-sig') for slipno in slipnodic.keys(): for n in range(len(slipnodic[slipno])): slipnodic[slipno][n].append(['brackets',[],'fractures',[],'addinfo',[]]) prepstring = slipnodic[slipno][n][0][0] #brackets prepstring,slipnodic[slipno][n][7][1] = bracketcheck(prepstring) #fracture symbols stringreg = slipnodic[slipno][n][1][0] prepstring,slipnodic[slipno][n][7][3] = fracturecheck(prepstring,stringreg,logfile) prepstring,slipnodic[slipno][n][7][5] = addinfocheck(prepstring,stringreg,[]) slipnodic[slipno][n].append(prepstring) return(slipnodic) #Consistency checks and consolidation of data def reflectaddinfo(addinfolist,prepstring): ''' Reflecting add info stored in addinfolist on prepstring. Used for two situations: Equal merge of strings like "司空、尉主" or "倉、司空主". Prepstring differ like "司空、尉主" and "司空(司空)、尉主" or "倉(司空)、司空主" and "倉、司空(司空)主" Acquisition merger of strings like "其一人為田鼂養:成" and "田(嗇夫)鼂" Original string differs because of different edition policies in different indices Parameters ---------- addinfolist : TYPE DESCRIPTION. prepstring : TYPE DESCRIPTION. Returns ------- None. ''' for n in range (len(addinfolist)): strwithoutaddinfo,strwithaddinfo = addinfolist[n] if strwithoutaddinfo in prepstring and strwithaddinfo not in prepstring: prepstring = re.sub(strwithoutaddinfo,strwithaddinfo,prepstring) return(prepstring) def concordancecheck(datalist): ''' Searches for complete concordances between strings within the same single value of slinodic Parameters ---------- datalist : list [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq], ['brackets',[(strwithbr,strwithoutbr)],'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],'prepstring']]} Returns ------- concordances : list [(a,b)] list of tuples. Every tuple entails the position of two records with concordant strings. a is to be incorporated into b. ''' #finding concordant records concordances = [] for a in range(len(datalist)-1,0,-1): stringoria = datalist[a][0][0] ##precaution "(充)令" and "(有酉)令" will be mistakenly taken as concordant #Using prepstring leads to confusion of "□季(?)" and "□季" etc #and ommission of "司空、尉主" and "倉、司空主" etc. for b in range(a-1,-1,-1): stringorib = datalist[b][0][0] if stringoria == stringorib: concordances.append((a,b)) #deleting abundant record pairs #5 4 #5 3 #4 3 #→5 3/5 4 needs to be processed only once for a in range(len(concordances)-2,-1,-1): if concordances[a][0] == concordances[a+1][0]:del concordances[a] return(concordances) def equalmerge(positions,datalist,slipno,logfile): ''' Merges two records within a single value of slipnodic. Checks frequencies before merging. Parameters ---------- positions : tuple (a,b) = a is to be absorbed in b datalist : list equivalent to slipnodic[slipno] [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq], ['brackets',[(strwithbr,strwithoutbr)],'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],'prepstring']]} [stringori],[stringreg],[lemmatano] still have only one value each. logfile : TextIOWrapper Returns ------- datalist : list ''' a,b = positions #reconfirmation if datalist[a][0] != datalist[b][0]: logfile.write('Inconsistency of original string form in equalmerge for record {} and {} of slip {}\n{}\n' '\n'.format(b,a,slipno,datalist)) #Merging data #prepstring if datalist[a][8] != datalist[b][8]: #checking for addinfo in a not reflected in b #e.g."司空、尉主" and "司空、尉主" datalist[b][8] = reflectaddinfo(datalist[a][7][5],datalist[b][8]) if datalist[a][8] != datalist[b][8]: logfile.write('Inconsistency of prepstring in equalmerge for record {} and {} of slip {}\n{}\n' '\n'.format(b,a,slipno,datalist)) if datalist[a][1] != datalist[b][1]: datalist[b][1][0] += '/' + datalist[a][1][0] datalist[b][2] += '/' + datalist[a][2] if datalist[a][3] and datalist[b][3]: datalist[b][3] += datalist[a][3] datalist[b][5] += datalist[a][5] datalist[b][6] += datalist[a][6] fields = (1,3,5) for n in fields: datalist[b][7][n] += datalist[a][7][n] #checking frequency and deleting record a if datalist[a][4] and datalist[a][4] == datalist[b][4]:del datalist[a] elif not datalist[a][4] and datalist[a][6][0] == datalist[b][6][0]:del datalist[a] elif datalist[a][4] > datalist[b][4]:datalist[a][4] -= datalist[b][4] elif datalist[a][6] and datalist[a][6][0] > datalist[b][6][0]:datalist[a][6][0] -= datalist[b][6][0] else:logfile.write('Frequency Inconsistency in equalmerge for record {} and {} of slip {}\n{}\n' '\n'.format(b,a,slipno,datalist)) return(datalist) def overlapcheck(datalist): ''' Searches for partial overlaps between strings within the same single value of slinodic. Only feasible after elimination of complete concordances. Parameters ---------- datalist : list equivalent to slipnodic[slipno] [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq], ['brackets',[(strwithbr,strwithoutbr)],'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],'prepstring']] [stringori],[stringreg],[lemmatano] still have only one value each. Returns ------- datalist : list ''' overlapping = {} for a in range(1,len(datalist)): stringoria = datalist[a][0][0] prepstringa = datalist[a][8] #Using prepstring leads to confusion of "□季(?)" and "□季" etc IDa = datalist[a][2] for b in range(0,a): stringorib = datalist[b][0][0] prepstringb = datalist[b][8] IDb = datalist[b][2] if (stringoria in stringorib or prepstringa in prepstringb) and IDa not in IDb: if b in overlapping.keys(): overlapping[b].append(a) else: overlapping[b] = [a] return(overlapping) def acquisitionmerge(positions,datalist,slipno,delrecords,logfile): ''' Absorbs records that entail strings concordant to parts of other strings into their records Parameters ---------- positions : tuple (a,b) = a is to be absorbed in b datalist : list equivalent to slipnodic[slipno] [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq], ['brackets',[(strwithbr,strwithoutbr)],'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]],[prepstring]]]} [stringori],[stringreg],[lemmatano] still have only one value each. slipno : str delrecords : list Deletion of unnecessary records would might lead to changes of positions and mis-recognition. Therefore, positions of deletable records are accumulated and used for deletion after finishing of all merging processes logfile : TextIOWrapper Returns ------- datalist : list [stringori],[stringreg],[lemmatano] obtained multiple values. delrecords: list ''' a,b = positions #Absorbing [stringori],[stringreg],ID,[lemmatano],[prepstring] of a into b datalist[b][8][0] = reflectaddinfo(datalist[a][7][5],datalist[b][8][0]) fields = (0,1,2,3,5,8) for n in fields: datalist[b][n] +=datalist[a][n] #Absorbing [(strwithbr,strwithoutbr)],'fractures',[(strwithfrmark,strwithoutfrmark)],'addinfo',[(strwithoutaddinfo,strwithaddinfo)]] fields = (1,3,5) for n in fields: datalist[b][7][n] += datalist[a][7][n] #Adjustment of frequency numbers if datalist[a][4] : frqa = datalist[a][4] else: frqa = datalist[a][6][0] if datalist[b][4] : frqb = datalist[b][4] else: frqb = datalist[b][6][0] frqainb = len(re.findall(datalist[a][0][0],datalist[b][0][0])) frqainbprp = len(re.findall(datalist[a][8][0],datalist[b][8][0])) if frqainb < frqainbprp:frqainb = frqainbprp frqa = frqa-(frqb*frqainb) if frqa == 0 and a not in delrecords: delrecords.append(a) elif frqa > 0 and datalist[a][4]:datalist[a][4] = frqa elif frqa > 0:datalist[a][6][0] = frqa else:logfile.write('frequency inconsistency in acquisitionmerge for record {} ' 'and {} in slip {}\n{}\n\n'.format(b,a,slipno,datalist)) return(datalist,delrecords) def indexdataconsolidation(slipnodic,filename): ''' Searches and merges concordant and overlapping records within individual values of slipnodic Parameters ---------- slipnodic : dict {slipno : [[[stringori],[stringreg],ID,[lemmatano],frq,[reflemno],[reffrq] ['brackets',[(strwithbr,strwithoutbr)], 'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]], 'prepstring']]} filename : str name of logfile Returns ------- slipnodic : dict {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq] ['brackets',[(strwithbr,strwithoutbr)], 'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]], [prepstring]]]} ''' logfile = open(filename,'w',encoding='utf-8-sig') for slipno in slipnodic.keys(): #Checking for concordances concordances = concordancecheck(slipnodic[slipno]) #merging concordant records for n in range(len(concordances)): slipnodic[slipno] = equalmerge(concordances[n],slipnodic[slipno],slipno,logfile) #Checking for overlaps overlapping = overlapcheck(slipnodic[slipno]) #Changing IDs and preprstrings into lists fields = (2,8) for n in range(len(slipnodic[slipno])): for m in fields: slipnodic[slipno][n][m] = [slipnodic[slipno][n][m]] #Absorbing overlapping records delrecords = [] for b in sorted(overlapping.keys()): for a in overlapping[b]: slipnodic[slipno],delrecords = acquisitionmerge((a,b),slipnodic[slipno],slipno,delrecords,logfile) delrecords.sort(reverse=True) for n in range (len(delrecords)):del slipnodic[slipno][delrecords[n]] #deleting duplications in stringprep info (brackets,fracures,addinfo) fields = (1,3,5) for n in range(len(slipnodic[slipno])): for m in fields: slipnodic[slipno][n][7][m] = tools.listduplicationdel(slipnodic[slipno][n][7][m]) return(slipnodic) #tagpreparation def tagpreplookuplemmata(lemmatano,lemmata): ''' Looks up lemmata from lammata list by lemmata number and creates temporary dic for connecting lemmata with lemmata number. Lemmata number will be necessary for creation of tags Parameters ---------- lemmatano : Int lemmata : list [(headword,headst,ID)] Returns ------- lemmatainstr : list List of lemmata entailed in the string temp : dic {lemma:[lemmatano]} necessary in order to create tags after sorting lemmata by length ''' lemmatainstr = [] temp = {} #extracting lemmata for n in range(len(lemmatano)): lemma = lemmata[lemmatano[n]][1] if lemma not in temp.keys(): temp[lemma] = [lemmatano[n]] lemmatainstr.append(lemma) elif temp[lemma] != lemmatano[n]: temp[lemma].append(lemmatano[n]) #sorting lemmata in descending order of length lemmatainstr = sorted(lemmatainstr , key=lambda x: len(re.findall('.',x))*-1) return(lemmatainstr,temp) def sortlemmatano(lemmatainstr,temp): ''' Sorts lemmata numbers in the order of lemmatainstr and returns them as a list of tag numbers Parameters ---------- lemmatainstr : list List of lemmata related to string temp : dic {lemma:[lemmatano]} Returns ------- tags : list ['lemmatano'] In case of different lemmata numbers for lemmata with identical standardized writing: 'lemmatano1/lemmatano2' ''' numbers = [] for n in range(len(lemmatainstr)): numbers.append('') lemma = lemmatainstr[n] for lemmatano in temp[lemma]: if str(lemmatano) not in numbers[n]: numbers[n] += str(lemmatano) + '/' numbers[n] = re.sub('/$','',numbers[n]) return(numbers) def tagprep(slipnodic,lemmata,filename): ''' looks up lemmata related to the string, sort them in descending order of length and creates tags Parameters ---------- slipnodic : dic {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq] ['brackets',[(strwithbr,strwithoutbr)], 'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]], [prepstring]]]} [lemmatano] in order of appearance in index data lemmata : list [(headword,headst,ID)] filename : TextIOWrapper Returns ------- Slipnodic : dic {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq] ['brackets',[(strwithbr,strwithoutbr)], 'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]], [prepstring],[lemmata]]]} [lemmatano] in order of [lemmata] [lemmata] in descending order of length ''' for slipno in slipnodic.keys(): for a in range(len(slipnodic[slipno])): #looking up lemmata and adding them to slipnodic lemmatano = slipnodic[slipno][a][3] + slipnodic[slipno][a][5] lemmatainstr,temp = tagpreplookuplemmata(lemmatano,lemmata) slipnodic[slipno][a].append(lemmatainstr) #resorting lemmata numbers in order of lemmatainstr lemmatano = sortlemmatano(lemmatainstr,temp) slipnodic[slipno][a][3] = lemmatano return(slipnodic) #tagging def inserttag(prepstring,tags,tagobjects,prefix,suffix,n,slipno,record,logfile): ''' Inserts tag into prepstr, enclousing taggablestrings Parameters ---------- prepstring : str slipnodic[slipno][n][8][0] tags : list tagobjects : list prefix : str if '(<\d+>)*', tagged string part will include preceding start tags if '', tagged string part won't include any preceding start tags suffix : str if '()*', tagged string part will include preceding end tags if '', tagged string part won't include any preceding end tags ---below is only for logfile use--- n : int number of record in slipnodic[slipno] slipno : str record : list slipnodic[slipno][n] logfile : TextIOWrapper Returns ------- pepstring ''' if len(tags) != len(tagobjects):sys.exit('Number of tagobjects not equal to number' 'of tagobjects numbers in record {} of slip {}\ntags are {}\n{}\n\n'.format(n,slipno,tags,record)) for a in range(len(tagobjects)): searchstring = tools.searchpreparation(tagobjects[a],'[┘## 〼■、,a-z()]*',prefix,suffix) #searchpreparationは本来[<\d/>()]*で充分であるが、 stringprepにおいてsliptextに基づいて改めた #stringoriには[┘## 〼■、,a-z()]によって検出された特殊記号が含まれるから、ここも #それを正規表記に含めなければならない。 #なお、本ファンクションにtagobjectsとして渡されるstringorisは実際はstringregに基づいており、 #stringprepによる変更を反映していない。本来はそこでstringregを改めるか、 stringorisにstringoriを #用いるべきであるが、諸般の事情でそれができない。 m = re.search(searchstring,prepstring) if m: before = m.group(0) hits = re.findall(before,prepstring) if len(hits) > 1:logfile.write('tagobject {} found more than one time' ' in prepstring {} for slip {}' .format(before,prepstring,slipno)) elif len(hits) == 0:logfile.write('tagobject {} not found in prepstring {} for slip {}' .format(before,prepstring,slipno)) after = '<' + str(tags[a]) + '>' + before + '' prepstring = re.sub(before,after,prepstring) else: logfile.write('mismatch for lemma {} in slip {}\nfull record:\n{}' '\n\n'.format(tagobjects[a],slipno,record)) return(prepstring) def tagging(slipnodic,filename): ''' Tags prepstr according to list of lemmata Parameters ---------- slipnodic : dic {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq] ['brackets',[(strwithbr,strwithoutbr)], 'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]], [prepstring],[lemmata]]]} filename : str Returns ------- slipnodic : dic {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq] ['brackets',[(strwithbr,strwithoutbr)], 'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]], [tagstring],[lemmata]]]} prepstring is replaced by tagstring (tagged string) ''' logfile = open(filename,'w',encoding='utf-8-sig') for slipno in slipnodic.keys(): for n in range(len(slipnodic[slipno])): lemmatano,lemmata = slipnodic[slipno][n][3],slipnodic[slipno][n][9] slipnodic[slipno][n][8][0] = inserttag(slipnodic[slipno][n][8][0],lemmatano,lemmata,'','',n,slipno,slipnodic[slipno][n],logfile) IDs,stringoris = slipnodic[slipno][n][2],slipnodic[slipno][n][1] slipnodic[slipno][n][8][0] = inserttag(slipnodic[slipno][n][8][0],IDs,stringoris,'(<\d+>)*','()*',n,slipno,slipnodic[slipno][n],logfile) logfile.close return(slipnodic) #Restoration of original string = Removal of string preparations def removeaddinfo(tagstring,addinfo,logfile): ''' Removes additional info originally deriving from bracketed info in stringreg Parameters ---------- tagstring : str equivalent to slipnodic[slipno][n][8] addinfo = slipnodic[slipno][n][7][5] addinfo : list [(before,after)] = list of tuples. Each tuples contains a pair of string parts before and after adding info logfile : TextIOWrapper Returns ------- tagstring : str ''' subslash = compile('([^<])/') for a in range(len(addinfo)-1,-1,-1): after = addinfo[a][1]#e.g."(啓陵)郵人(匄)","道(令/長)" adds = re.findall('([^)]+)',after)#e.g.[(啓陵),(匄)],[(令/長)] for b in range(len(adds)): regex = tools.searchpreparation(adds[b],'[<\d/a-z>]*','','') hits = re.findall(regex,tagstring)#e.g.['(<60>匄)'] #['(<879>令/<789>長)'] if len(hits) > 0: #preparation of a version of hits with addinfo removed remnants = hits[0] delchar = re.findall('.',adds[b]) for c in range(len(delchar)): if delchar[c] == '/':remnants = subslash.sub('\g<1>',remnants,1) else:remnants = re.sub(delchar[c],'',remnants,1) #checking romoval results delchar = tools.listduplicationdel(delchar) for c in delchar: if c != '/' and c in remnants:logfile.write('more than 1 appearance of character {} in string {},addinfo is ' '{}\n\n'.format(c,hits[0],after)) #removing addinfo from tagged string tagstring = re.sub(hits[0],remnants,tagstring,1) else: prepstring = re.sub('[]','',tagstring) logfile.write('addinfo {} not found in string {}\nprepstring is {}\n\n'.format(adds[b],tagstring,prepstring)) return(tagstring) def restorebrackets(tagstring,brackets,slipno,logfile): ''' Parameters ---------- tagstring : str equivalent to slipnodic[slipno][n][8] brackets = slipnodic[slipno][n][7][5] brackets : list [(before,after)] = list of tuples. Each tuples contains a pair of string parts before and after adding info logfile : TextIOWrapper Returns ------- tagstring : str ''' for a in range(len(brackets)-1,-1,-1): before, after = brackets[a][0],brackets[a][1] if len(after)> 1:logfile.write('prepstring part after bracketcheck longer than' ' one character in slip {}\nbefore is {}, after is {}\n\n' .format(slipno,before,after)) hits = re. findall(after,tagstring) if len(hits) > 1:logfile.write('prepstring part after bracketcheck found {}' ' times in tagstring of slip {}\nbefore is {}, after is {}\n\n' .format(len(hits),slipno,before,after)) elif len(hits) == 0:logfile.write('prepstring part after bracketcheck not found' ' in tagstring of slip {}\nbefore is {}, after is {}\n\n' .format(slipno,before,after)) tagstring = re.sub(after,before,tagstring) return(tagstring) def restorefracmarks(tagstring,fracinfo): ''' Restores the original fracture marks. E.g. 【嬰児某】 → '〼' Parameters ---------- tagstring : TYPE DESCRIPTION. fracinfo : TYPE DESCRIPTION. Returns ------- None. ''' for n in range(len(fracinfo)): tagstring = re.sub(fracinfo[n][1],fracinfo[n][0],tagstring,1) if '【' in tagstring: tagstring = restorefracmarks(tagstring,fracinfo) return(tagstring) def divergencecheck(stringori,tagstring,fracinfo): ''' Removes all tags from tagstring and checks divergence from original string. Parameters ---------- stringori : str tagstring : str Returns ------- Returns tagstring if not identical to original string and empty string '' if identical. ''' tagstring = re.sub('','',tagstring) tagstring = re.sub('','',tagstring) tagstring = restorefracmarks(tagstring,fracinfo) if stringori == tagstring:return('') else:return(tagstring) def stringprepremoval(slipnodic,lemmata,filename): ''' Restores original string by remove alterations conducted during string preparation, i.e. restoration of bracket expressions and removal of additional information taken from standardized string forms Parameters ---------- slipnodic : dic {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq] ['brackets',[(strwithbr,strwithoutbr)], 'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]], [tagstring],[lemmata]]]} lemmata : list [(headword,headst,ID)] filename : str Returns ------- slipnodic : dic ''' logfile = open(filename,'w',encoding='utf-8-sig') logfile02 = open('log(tagremovalmanualcorrectionneeded.txt','w',encoding='utf-8-sig') manualtagging = inputmethods.manualtagginginput('input(manualtagging).txt') for slipno in slipnodic.keys(): for n in range(len(slipnodic[slipno])): stringori = slipnodic[slipno][n][0][0] #checking manually tagged strings if slipno+stringori in manualtagging.keys(): tagstring = slipnodic[slipno][n][8][0] = manualtagging[slipno+stringori] else: #removing info added depending on stringreg tagstring = slipnodic[slipno][n][8][0] addinfo = slipnodic[slipno][n][7][5] tagstring = removeaddinfo(tagstring,addinfo,logfile) #restoring brackets coming from original string brackets = slipnodic[slipno][n][7][1] slipnodic[slipno][n][8][0] = restorebrackets(tagstring,brackets,slipno,logfile) tagstring = slipnodic[slipno][n][8][0] #checking convergence with original string fracinfo = slipnodic[slipno][n][7][3] tagremovedstr = divergencecheck(stringori,tagstring,fracinfo) if tagremovedstr: logfile02.write('Tagged string of slip {} different from original string!\n' 'Original string:{}\ntagged string without tags:{}\ntagged string is{}\n' .format(slipno,stringori,tagremovedstr,tagstring)) lemmatano = slipnodic[slipno][n][3] for no in lemmatano: logfile02.write('{}\t{}\n'.format(no,lemmata[int(no)])) logfile02.write('\n') logfile.close() logfile02.close() return(slipnodic) def matchexepttaggedstring(string,text): ''' Finds all matches of string in text except strings already tagged and returns them as a list. Parameters ---------- string : str text : str Returns ------- liste : list ''' templist = re.findall('(?:(?:)|[^])?'+string+'(?:(?:)|[^])?',text) liste = [] [liste.append(x) for x in templist if not re.search(r'<(\d+)>{}'.format(string),x)] return(liste) #Replacing the strings in original text by tagged strings def replacewithfrqcheck(stringori,frqindex,tagstring,slipno,sliptextstring,logfile): ''' Looks up sakuindata strings in slip text and replaces them with tagged string In case of inconsistencies in number of occurrences, extended strings will be applied in order to exclude unnecessary overlaps Parameters ---------- stringori : str tagstring : str frqindex : int sliptextstring : str logfile : TextIOWrapper Returns ------- None. ''' #creating list of appliable strings #using extended strings in order to exclude tagged strings xstringori = matchexepttaggedstring(stringori,sliptextstring) #checking frequency inconsistency frqtext = len(xstringori) if frqtext > frqindex:#Partial overlap between different stringori, such as 尉守蜀 and 尉 in slip 8-0652a+8-0067a, causes inflation in text matches logfile.write('frequency inconsistency for {} in slip {} caused by overlapping text\n' 'frequency in index is {} , frequency in text is {}\n' 'Full text is as follows:\n{}\n' '\n'.format(stringori,slipno,frqindex, frqtext,sliptextstring)) elif frqtext < frqindex:#string variations between text and index causes mismatches logfile.write('frequency inconsistency for {} in slip {} caused by mismatching\n' 'frequency in index is {} , frequency in text is {}\n' 'Full text is as follows:\n{}\n' '\n'.format(stringori,slipno,frqindex, frqtext,sliptextstring)) for n in range(len(xstringori)): xtagstring = re.sub(stringori,tagstring,xstringori[n])#tagging extended string sliptextstring = re.sub(xstringori[n],xtagstring,sliptextstring,1)#replacing string in sliptext with tagged extended string return(sliptextstring) def sliptextreplacement(slipnodic,sliptext,filename): ''' Replaces original string with tagged string and creates a new slip text data file with tags Parameters ---------- slipnodic : dic {slipno : [[[stringori],[stringreg],[ID],[lemmatano],frq,[reflemno],[reffrq] ['brackets',[(strwithbr,strwithoutbr)], 'fractures',[(strwithfrmark,strwithoutfrmark)], 'addinfo',[(strwithoutaddinfo,strwithaddinfo)]], [tagstring],[lemmata]]]} sliptext : dic {slipno:[sliptext,docstyle,posinstyle,posindoc]} logfile : TextIOWrapper Returns ------- sliptexttag : dic {slipno:[sliptext,docstyle,posinstyle,posindoc]} ''' sliptexttag = copy.deepcopy(sliptext) logfile = open(filename,'w',encoding='utf-8-sig') #Checking slip numbers slipnodickeys = list(slipnodic.keys()) for slipno in slipnodickeys: if '~' in slipno:#index data stretch over two slips tempslipno = slipno.split('~') if len(slipnodic[slipno]) > 1 : logfile.write('Straddling data of slip {} has more than one record'.format(slipno)) temprecord = [[],[]] if tempslipno[0] not in slipnodic.keys(): slipnodic[tempslipno[0]] = [] temprecord[0] = temprecord[1] = copy.deepcopy(slipnodic[slipno][0]) temprecord[0][0][0] = re.sub('#.+$','',temprecord[0][0][0]) temprecord[0][8][0] = re.sub('#.+$','',temprecord[0][8][0]) slipnodic[tempslipno[0]].append(temprecord[0]) if tempslipno[1] not in slipnodic.keys(): slipnodic[tempslipno[1]] = [] temprecord[1][0][0] = re.sub('^[^#]+#','',temprecord[1][0][0]) temprecord[1][8][0] = re.sub('^[^#]+#','',temprecord[1][8][0]) slipnodic[tempslipno[1]].append(temprecord[1]) elif slipno not in sliptexttag.keys(): logfile.write('slip number {} not found in slip text\n'.format(slipno)) #Replacing strings for slipno in slipnodic.keys(): if '~' in slipno or slipno not in sliptexttag.keys():continue for n in range(len(slipnodic[slipno])): stringori,frqindex,tagstring = slipnodic[slipno][n][0][0],slipnodic[slipno][n][4],slipnodic[slipno][n][8][0] sliptexttag[slipno][0] = replacewithfrqcheck(stringori,frqindex,tagstring,slipno,sliptexttag[slipno][0],logfile) logfile.close() return(sliptexttag) def histdatetagging(histdates,filename): ''' Creates tags for historical dates and inserts them in pertinent strings Parameters ---------- hitsdates : dic {slipnumber:[[stringori,frq,stringreg,slipno,IDchr,IDBCE,slipnote]]} filename : TextIOWrapper Returns ------- histdates : dic {slipnumber:[[stringori,frq,stringreg,slipno,IDchr,IDBCE,slipnote,tagstring]]} ''' logfile = open(filename,'w',encoding='utf-8-sig') for slipno in histdates.keys(): for n in range(len(histdates[slipno])): stringreg = histdates[slipno][n][2] tags = [histdates[slipno][n][4]] tagobjects = [histdates[slipno][n][2]] tagstring = inserttag(stringreg,tags,tagobjects,'','',n,slipno,histdates[slipno][n],logfile) tags = ['histdate'] tagstring = inserttag(tagstring,tags,tagobjects,'(<\d+>)*','()*',n,slipno,histdates[slipno][n],logfile) histdates[slipno][n].append(tagstring) return(histdates) def histdatesreplacemnt(histdates,sliptext,filename): ''' Replaces historical date strings in slip text with tagged string Parameters ---------- histdates : dic {slipnumber:[[stringori,frq,stringreg,slipno,IDchr,IDBCE,slipnote,tagstring]]} sliptext : dic {slipno:[sliptext,docstyle,posinstyle,posindoc]} filename : str Returns ------- sliptext : dic {slipno:[sliptext,docstyle,posinstyle,posindoc]} ''' logfile = open(filename,'w',encoding='utf-8-sig') for slipno in histdates.keys(): if slipno not in sliptext.keys(): logfile.write('slip number {} not found in slip text data\n\n'.format(slipno)) continue for n in range(len(histdates[slipno])): stringori,frqindex,tagstring = histdates[slipno][n][0],histdates[slipno][n][1],histdates[slipno][n][7] #stringori = tools.searchpreparation(stringori,'[# ☒■]*') sliptext[slipno][0] = replacewithfrqcheck(stringori,frqindex,tagstring,slipno,sliptext[slipno][0],logfile) return(sliptext) if __name__ == "__main__": zero = 0