# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 10:07:13 2024

@author: Arnd Helmut Hafner

input method for indexdata02.py, inputting index data
"""
import sys  # for sys.exit after error
import re
from re import compile, search

try:
    # Project-local helper module. Nothing in this file references it, so a
    # missing module must not prevent the readers below from being imported.
    import tools
except ImportError:
    tools = None


# General input-related functions
def nondataline(conditions, line):
    '''Checks whether a data line contains no substantial data.

    Meant to exclude headlines, page number lines etc.

    Parameters
    ----------
    conditions : list
        A list of regex that hit non-data-related lines.
        Differs from index to index.
    line : str
        One data line.

    Returns
    -------
    bool
        True = unrelated line (at least one regex hits).
    '''
    return any(re.search(condition, line) for condition in conditions)


def headwordcheck(condition, line):
    '''Checks if line contains a headword. Returns the word if so.

    Parameters
    ----------
    condition : str
        Regex that recognizes headlines; its first group is the headword.
        Can only hit exactly after excluding non-related lines with
        nondataline.
    line : str
        Index data line.

    Returns
    -------
    str
        The headword, or '' if the regex doesn't hit.
    '''
    m = re.search(condition, line)
    return m.group(1) if m else ''


def slipnotecheck(string):
    '''Separates a note mark ('*' or '#') from a slip number or string.

    Parameters
    ----------
    string : str
        Possibly containing a '*' or '#' as a remark.

    Returns
    -------
    string : str
        Input with all occurrences of the note mark removed.
    slipnote : str
        '*', '#' or '' ('*' takes precedence if both are present).
    '''
    for mark in ('*', '#'):
        if mark in string:
            return (string.replace(mark, ''), mark)
    return (string, '')


# Input related functions for personal name index
def persnamestandard(headword):
    '''Creates a headword and its standardized form.

    Parameters
    ----------
    headword : str
        Same as persname.

    Returns
    -------
    headword : str
        Headword; '【……】' is replaced by '〼'.
    headst : str
        Standardized form of the headword.
    '''
    if '(' in headword:
        # Hits "字(字)" and "{字+字}(字)" and keeps only the character in
        # round brackets. The brackets are literal characters and therefore
        # have to be escaped.
        # NOTE(review): the pattern in the source, '({[^}]+}|.)((.)??)',
        # treated the brackets as regex groups and reduced every headword
        # to ''. The optional '?' after the bracketed character is inferred
        # from that pattern's '??' - confirm against the data.
        regex = re.compile(r'(\{[^}]+\}|.)\((.)\??\)')
        headst = regex.sub(r'\g<2>', headword)
    elif '【' in headword:
        headst = headword
        # 〼: expression for broken slips in original text
        # '【……】': standardized expression for broken slips
        headword = headword.replace('【……】', '〼')
    else:
        headst = headword
    return (headword, headst)


def persnameitems(line):
    '''Analyses a data line and returns the result as Match object.

    Data lines in persname index have two different patterns.
    Pattern1: Headword,space,slipnumber,space,notes1,space,notes2,space,notes3
    Pattern2: Slipnumber,space,notes1,space,notes2,space,notes3

    Parameters
    ----------
    line : str
        Data line.

    Returns
    -------
    m : match object or None
        In both cases, m.group(1) = slipnumber, m.group(2~4) = notes.
    '''
    m = re.search(r'^[^ \dJ]+ ([^ ]+) *([^ ]*) *([^ ]*) *([^ ]*)$', line)
    if m:
        return m
    return re.search(r'([^ ]+) *([^ ]*) *([^ ]*) *([^ ]*)$', line)


def persnameinput(lemmata, lemmatano, items, ID='persname',
                  filename='indexdata(personal names).txt'):
    '''Reads data of personal name index.

    Parameters
    ----------
    lemmata : list
        [(headword,headst,ID)]
        (persname index doesn't discern lemmata and strings!)
    lemmatano : int
        Counting the number of lemmata, will be stored in items.
    items : list
        [(stringori,stringreg,lemmatano,slipnumber,slipnote,notes)]
    ID : str
        Sort of index. The default is 'persname'.
    filename : str
        Name of file containing personal name index data.
        The default is 'indexdata(personal names).txt'.

    Returns
    -------
    lemmata : list
        List of headwords.
    lemmatano : int
        Number of accumulated lemmata.
    items : list
        List of index items.

    Raises
    ------
    SystemExit
        If a line can't be analysed, or an item line precedes the first
        headword.
    '''
    # '★' doesn't exist in the original data. It is used to manually
    # eliminate individual records from being processed.
    # '(' is escaped: unescaped it is an unbalanced group and re raises.
    conditions = [' →', r'\(続き', r'^[.<]', '★']
    stringori = stringreg = None
    with open(filename, 'r', encoding='utf-8-sig') as datafile:
        for line in datafile:
            line = line.replace('\n', '')
            # Excluding non-data-related lines
            if nondataline(conditions, line):
                continue
            # Extracting headwords
            persname = headwordcheck(r'^([^ \dJ]+)', line)
            if persname:
                lemmatano += 1
                stringori, stringreg = persnamestandard(persname)
                # Adding data to lemmata
                # NOTE(review): stores the standardized form twice although
                # the docstring says (headword,headst,ID) - kept as written.
                lemmata.append((stringreg, stringreg, ID))
                # A line without any digit holds nothing but the headword.
                # NOTE(review): placement inside this branch is
                # reconstructed from a whitespace-mangled source.
                if re.search(r'^[^\d]+$', line):
                    continue
            # Extracting items
            m = persnameitems(line)
            if m is None or stringori is None:
                sys.exit('mismatch in persname:' + line)
            slipnumber, slipnote = slipnotecheck(m.group(1))
            notes = m.group(2) + m.group(3) + m.group(4)
            items.append((stringori, stringreg, lemmatano, slipnumber,
                          slipnote, notes))
    return lemmata, lemmatano, items


# Input-related functions for official name and other indices
def officialaoitems(line):
    '''Analyses a data line of official name and other indices.

    Structure of data lines: stringreg,tab,slipnumber,tab,stringori
    (If stringreg = stringori, stringori is omitted)

    Parameters
    ----------
    line : str

    Returns
    -------
    (stringreg,slipnumber,stringori)
        ('','','') if the line doesn't have this structure.
    '''
    # [^→←] excludes reference items
    m = re.search(r'^([^\t→←]+)\t([^\t]+)\t*([^\t]*)$', line)
    if not m:
        return ('', '', '')
    stringreg, slipnumber, stringori = m.groups()
    # An omitted stringori means stringreg = stringori
    return (stringreg, slipnumber, stringori or stringreg)


def officialaorefitems(line):
    '''Extracts a reference item of the index while checking the format.

    Apart from the arrow at the beginning of the line, the format is the
    same as that of ordinary items. So after confirming the arrow, the
    actual extraction is left to officialaoitems.

    Parameters
    ----------
    line : str

    Returns
    -------
    (stringreg,slipnumber,stringori)
        ('','','') if the line doesn't start with an arrow.
    '''
    if re.search('^[←→]', line):
        return officialaoitems(re.sub('[←→]', '', line))
    return ('', '', '')


def officialandotherinput(lemmata, lemmatano, items, refitems, ID, filename):
    '''Reads data of the three indices entailing reference items:
    official name index, rank index, labor related term index.

    Parameters
    ----------
    lemmata : list
        [(headword,headst,ID)]
    lemmatano : int
        Counting the number of lemmata, will be stored in items.
    items : list
        [(stringori,stringreg,lemmatano,slipnumber,slipnote,notes)]
    refitems : list
        [(stringori,stringreg,lemmatano,slipnumber,slipnote,notes)]
    ID : str
        Sort of index.
    filename : str
        Name of file containing the index data.

    Returns
    -------
    lemmata : list
        List of headwords.
    lemmatano : int
        Number of accumulated lemmata.
    items : list
        List of index items.
    refitems : list
        List of reference items.

    Raises
    ------
    SystemExit
        If a line is neither headline, item nor reference item.
    '''
    notes = ''  # These indices don't entail notes
    conditions = [' →', r'^[.<]', '★']
    with open(filename, 'r', encoding='utf-8-sig') as datafile:
        for line in datafile:
            line = line.replace('\n', '')
            # Excluding non-data-related lines
            if nondataline(conditions, line):
                continue
            # Extracting headwords. In these three indices, headwords appear
            # only in separate headlines without slip numbers.
            headword = headwordcheck(r'^([^\d]+)$', line)
            if headword:
                lemmatano += 1
                # Headwords entail explanatory information in round brackets
                # like 安陽(県); '(' is literal and has to be escaped.
                headst = re.sub(r'\(.+$', '', headword)
                lemmata.append((headword, headst, ID))
                continue
            # Extracting items
            stringreg, slipnumber, stringori = officialaoitems(line)
            target = items
            if not stringreg:
                # Extracting reference items
                stringreg, slipnumber, stringori = officialaorefitems(line)
                target = refitems
            if not stringreg:
                sys.exit('mismatch in officialandotherinput:' + line)
            slipnumber, slipnote = slipnotecheck(slipnumber)
            target.append((stringori, stringreg, lemmatano, slipnumber,
                           slipnote, notes))
    return lemmata, lemmatano, items, refitems


# Input-related functions for place name index
def pllinebreakdown(line):
    '''Extracts headwords and items from place name index.

    Data line structure: headword,prop,slipnumberlist,note1,note2
    (prop = property of place name, e.g. province, prefecture etc.)

    Parameters
    ----------
    line : str
        Index data line.

    Returns
    -------
    headword,prop,slipnumberlist,note1,note2

    Raises
    ------
    SystemExit
        If the line doesn't consist of five non-empty tab separated fields.
    '''
    m = re.search(r'^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)$', line)
    if not m:
        sys.exit('mismatch in place name index data line\n{}'.format(line))
    return m.groups()


def _plannotate(units, headst):
    '''Appends "(standard character)" to every unit that differs from it.'''
    return ''.join(unit if unit == std else unit + '(' + std + ')'
                   for unit, std in zip(units, headst))


def plstringformatting(headst, stringori):
    '''Formats the place name strings.

    Place name index preserves original strings in two ways.
    1)The most common version is preserved in "AB" of the headword format
      "AB(CD)". (CD=standardized form)
    2)Irregular versions are put in round brackets behind the slip numbers.
    Irregular versions behind slip numbers can have different formats:
    2-1)AB, corresponding to 1)
        (This function changes "AB(CD)" into "A(C)B(D)" which is the
        common format)
    2-2)C, omitting a character
        (This function adds the omitted character in round brackets)

    Parameters
    ----------
    headst : str
        Standardized writing of the headword.
    stringori : str
        Writing of the string in the original text.

    Returns
    -------
    stringreg,stringori
        NOTE: in this order.

    Raises
    ------
    SystemExit
        If stringori is longer than headst even when every "{...}" is
        counted as one character.
    '''
    if len(headst) == len(stringori):  # turns AB(CD) into A(C)B(D)
        return (headst, _plannotate(stringori, headst))
    # Same length once the literal marker "(?)" is ignored: keep as is.
    # NOTE(review): the source also compared against
    # re.sub('[]','',stringori); the members of that character class are
    # lost (an empty class is an invalid regex), so that check can't be
    # reproduced here - restore the characters from the original file.
    if len(headst) == len(stringori.replace('(?)', '')):
        return (headst, stringori)
    if len(headst) > len(stringori):
        # Characters omitted in the original text are put in round brackets
        stringreg = ''.join(char if char in stringori else '(' + char + ')'
                            for char in headst)
        return (stringreg, stringori)
    # String includes non-unicode characters written as "{...}"; each of
    # them counts as one character. They are matched as plain text because
    # they contain regex metacharacters such as '+'.
    units = re.findall(r'\{[^}]+\}|.', stringori, flags=re.DOTALL)
    if len(units) != len(headst):
        sys.exit('unequal length in stringorikeishiki\n' + headst + '\t'
                 + stringori)
    return (headst, _plannotate(units, headst))


def plheadwordprocessing(headword, prop):
    '''Processes the headwords of place name index and brings them into
    similar shape as other indices.

    Strings = Headwords → Headwords incorporate stringori and stringreg

    Parameters
    ----------
    headword : str
        "AB(CD)" (AB=original writing, CD=standardized form) or plain "AB".
    prop : str
        Property of the place name.

    Returns
    -------
    headword : str
        headst followed by prop in round brackets.
    headst : str
        Standardized form of headword.
    stringori : str
    '''
    # Round brackets are literal here: group 1 = AB, group 2 = CD
    m = re.search(r'([^(]+)\(([^)]+)\)', headword)
    if m:
        headst, stringori = plstringformatting(m.group(2), m.group(1))
    else:
        headst = headword
        stringori = headword
    headst = headst.replace('〼', '【……】')
    headword = headst + '(' + prop + ')'
    return (headword, headst, stringori)


def plstringcreation(headst, stringori, slipnumber):
    '''Creates the strings belonging to one slip number.

    An irregular original writing can be noted in round brackets behind the
    slip number. The place name index originally didn't discern original
    text and standardized writing; the distinction was added because it is
    needed for tagging.

    Parameters
    ----------
    headst : str
        Standardized form of the headword.
    stringori : str
        Original writing taken from the headword; used if no irregular
        writing is noted.
    slipnumber : str
        Slip number, possibly followed by "(irregular writing)".

    Returns
    -------
    stringreg,stringori,slipnumber
        slipnumber without the bracketed note.
    '''
    # Group 1 = slip number, group 2 = bracketed writing, which may itself
    # contain the literal marker "(?)".
    m = re.search(r'^([^(]+)\((([^)]|\(\?\))+)\)$', slipnumber)
    stringreg = headst
    if m:
        slipnumber = m.group(1)
        stringori = m.group(2)
        if '〼' not in stringori:
            stringreg, stringori = plstringformatting(headst, stringori)
    return (stringreg, stringori, slipnumber)


def plmultipleappearance(slipnumber):
    '''Checks whether a number of appearances is noted behind the slip
    number.

    The place name index originally doesn't note numbers of appearances.
    As they are needed for tagging the transcriptions, they were added
    manually, guided by malfunctions during tagging.
    Format of the addition: [slipnumber count] (i.e. a half-width space
    behind the slip number, followed by the count in digits).
    If the string appears only once, the addition is omitted.

    Parameters
    ----------
    slipnumber : str

    Returns
    -------
    slipnumber : str
        The addition concerning the count is removed.
    kaisu : int
        Number of appearances.
    '''
    m = re.search(r'^([^ ]+) (\d+)', slipnumber)
    if m:
        return (m.group(1), int(m.group(2)))
    return (slipnumber, 1)


def placenameinput(lemmata, lemmatano, items, ID, filename,
                   logfilename='output(placename overlapping with official names).txt'):
    '''Reads data of place name index.

    Parameters
    ----------
    lemmata : list
        [(headword,headst,ID)]
    lemmatano : int
        Counting the number of lemmata, will be stored in items.
    items : list
        [(stringori,stringreg,lemmatano,slipnumber,slipnote,notes)]
    ID : str
        Sort of index.
    filename : str
        Name of file containing place name index data.
    logfilename : str, optional
        Name of the file that receives the data lines overlapping with the
        official name index.

    Returns
    -------
    lemmata : list
        List of headwords.
    lemmatano : int
        Number of accumulated lemmata.
    items : list
        List of index items.
    plnotes : list
        [(headword,note1,note2)]
    '''
    notes = ''  # The place name index doesn't have ordinary notes
    plnotes = []  # accumulating place name notes together with headword
    conditions = ['→', r'^[.<]', '★']
    with open(filename, 'r', encoding='utf-8-sig') as datafile, \
            open(logfilename, 'w', encoding='utf-8-sig') as logfile:
        for line in datafile:
            line = line.replace('\n', '')
            # Excluding non-data-related lines
            if nondataline(conditions, line):
                continue
            # Extracting headwords and items
            headword, prop, slipnumberlist, note1, note2 = pllinebreakdown(line)
            # Excluding data lines that overlap with official name index.
            # Storing data in logfile
            if re.search('^[郷県道亭郡津国関郵]', prop):
                logfile.write('{}\t{}\t{}\t{}\n{}\n'.format(
                    headword, prop, note1, note2, slipnumberlist))
                continue
            lemmatano += 1
            # Processing headwords
            headword, headst, headstringori = plheadwordprocessing(headword, prop)
            # Adding data to lemmata
            lemmata.append((headword, headst, ID))
            # Adding data peculiar to place name index to plnotes
            plnotes.append((headword, note1, note2))
            # Breaking down slipnumberlist and creating strings for each
            # number
            for slipnumber in slipnumberlist.split(','):
                # Checking whether multiple appearances are noted behind
                # slip number
                slipnumber, multiple = plmultipleappearance(slipnumber)
                # Checking slipnotes
                slipnumber, slipnote = slipnotecheck(slipnumber)
                # Checking whether irregular original writings are noted
                # behind slip number. (It is confirmed that there are no
                # cases of coincidence of slipnumber notes and notes about
                # irregular writings)
                stringreg, stringori, slipnumber = plstringcreation(
                    headst, headstringori, slipnumber)
                # Adding items, one per appearance
                item = (stringori, stringreg, lemmatano, slipnumber,
                        slipnote, notes)
                items.extend([item] * multiple)
    return lemmata, lemmatano, items, plnotes


def sliptextinput(filename):
    '''Reads the original slip text and returns it as a dictionary
    accessible by slip number.

    Parameters
    ----------
    filename : str
        Structure of file (one slip, two or more lines):
            docstyle,tab,posinstyle,tab,posindoc,tab,slipnumber
            sliptext
            sliptext
            ...
        docstyle: document style
        posinstyle: position of document within a group of same docstyle
        posindoc: position of slip within the same document

    Returns
    -------
    sliptext : dict
        {slipnumber:[sliptext,docstyle,posinstyle,posindoc]}
    slipnolist : list
        Remembers slip numbers in order of appearance in order to check
        with original file.

    Raises
    ------
    SystemExit
        If a text line appears before the first metadata line.
    '''
    sliptext = {}
    slipnolist = []
    slipno = ''
    metadata = re.compile(
        r'^([\d\-]+)\t(\d+)\t(\d+)\t([\d\-+abcdJ⑯⑦⑨⑫⑬⑭]+)$')
    with open(filename, 'r', encoding='utf-8-sig') as datafile:
        for line in datafile:
            line = line.replace('\n', '')
            m = metadata.search(line)
            if m:  # Hits a line with metadata concerning docstyle etc.
                slipno = m.group(4)
                slipnolist.append(slipno)
                sliptext[slipno] = ['', m.group(1), m.group(2), m.group(3)]
                continue
            # Hits lines with slip text.
            if slipno not in sliptext:
                sys.exit('mismatch in sliptextinput: slip text before first '
                         'metadata line\n' + line)
            entry = sliptext[slipno]
            # In case of multiline texts, linebreak '#' is inserted
            # between the lines.
            if entry[0]:
                entry[0] += '#'
            entry[0] += line
    # For the last slip, too, the linebreak mark '#' at the end of the
    # text is removed. (Nothing to do for an empty file.)
    if slipno in sliptext:
        sliptext[slipno][0] = re.sub('#$', '', sliptext[slipno][0])
    return (sliptext, slipnolist)


def sliptextcheckoutput(sliptext, slipnolist, filename):
    '''Writes the slip texts back to a file in the format read by
    sliptextinput, in order to check them against the original file.

    Parameters
    ----------
    sliptext : dict
        {slipnumber:[sliptext,docstyle,posinstyle,posindoc]}
    slipnolist : list
        Slip numbers in order of output.
    filename : str
        Name of the output file.
    '''
    with open(filename, 'w', encoding='utf-8-sig') as dataoutput:
        for slipno in slipnolist:
            text, docstyle, posinstyle, posindoc = sliptext[slipno][:4]
            dataoutput.write('{}\t{}\t{}\t{}\n{}\n'.format(
                docstyle, posinstyle, posindoc, slipno,
                text.replace('#', '\n')))
    print('outputed {} items to {}'.format(len(slipnolist), filename))


def manualtagginginput(filename):
    '''Reads manually tagged strings.

    Manually tagging strings that couldn't be reconstructed according to
    original strings in dataprocessing.stringprepremoval
    (log(tagremovalmanualcorrectionneeded.txt)

    Parameters
    ----------
    filename : str
        File with three tab separated fields per line.

    Returns
    -------
    manualtagging : dict
        {slipno+stringori:tagstring}

    Raises
    ------
    SystemExit
        If a line doesn't consist of three non-empty fields.
    '''
    manualtagging = {}
    with open(filename, 'r', encoding='utf-8-sig') as datafile:
        for linenumber, line in enumerate(datafile, start=1):
            line = line.replace('\n', '')
            m = re.search(r'^([^\t]+)\t([^\t]+)\t([^\t]+)$', line)
            if not m:
                sys.exit('mismatch in manualtagginginput line {}\nline is {}'
                         .format(linenumber, line))
            manualtagging[m.group(1) + m.group(2)] = m.group(3)
    return manualtagging


# Input-related functions for historical date index
def readdates(filenamedata='indexdata(dates).txt',
              filenamelog='log(reading histdates).txt'):
    '''Reads data of historical date index.

    Parameters
    ----------
    filenamedata : str, optional
        Name of datafile. The default is 'indexdata(dates).txt'.
        Data structure:
            stringori,tab,stringreg,tab,slipnumber,tab,IDchr,tab,IDBCE
        IDchr: nnnnnnnn(nnn=year,nnn=month,nn=day)
        year: 101=秦王政元年...125=秦王政二十五年;
              126=秦始皇二十六年...137=秦始皇三十七年
              201=秦二世元年
        month: 010=十月,011=十一月,012=十二月,101=正月,102=二月...
               109=九月,119=後九月
        unknownyears: 999=□□年;193/192=卅□年/廿□年;992/995=□二年/□五年
        unknownmonths: 999=□(□)月;019=十□月
    filenamelog : str, optional
        Name of logfile. The default is 'log(reading histdates).txt'.

    Returns
    -------
    histdates : dict
        {slipnumber:[[stringori,frq,stringreg,slipno,IDchr,IDBCE,slipnote]]}
        The entries of each slip are sorted in declining order of length of
        stringori.

    Raises
    ------
    SystemExit
        If a line doesn't consist of five non-empty fields.
    '''
    histdates = {}
    with open(filenamedata, 'r', encoding='utf-8-sig') as datafile, \
            open(filenamelog, 'w', encoding='utf-8-sig') as logfile:
        for linenumber, line in enumerate(datafile, start=1):
            # Without this the last field keeps the trailing linebreak
            line = line.replace('\n', '')
            m = re.search(
                r'^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)$', line)
            if not m:
                sys.exit('mismatch inreaddates line {}\nline is {}'
                         .format(linenumber, line))
            stringori, stringreg, slipno, IDchr, IDBCE = m.groups()
            slipnote = ''
            if '*' in stringori:
                stringori = stringori.replace('*', '')
                slipnote = '*'
                stringreg = stringreg.replace('*', '')
            entries = histdates.setdefault(slipno, [])
            for entry in entries:
                if stringori == entry[0]:
                    # Same string on the same slip: only count it
                    entry[1] += 1
                    if stringreg != entry[2]:
                        logfile.write('inconsistency in histdates\n{}\n{}'
                                      '\n\n'.format(entry, [stringori, 1, stringreg, slipno, IDchr, IDBCE]))
                    break
            else:
                entries.append([stringori, 1, stringreg, slipno, IDchr,
                                IDBCE, slipnote])
    # Sorting histdates in declining order of length of stringori
    for slipno in histdates:
        histdates[slipno] = sorted(
            histdates[slipno], key=lambda x: len(re.findall('.', x[0])) * -1)
    return histdates


if __name__ == "__main__":
    zero = 0