# -*- coding: utf-8 -*-
"""
Created on Sun Jun 9 21:27:20 2024

@author: Arnd Helmut Hafner
"""
import sys  # for sys.exit after error
import re

# Metadata line of a slip: docstyle, posinstyle, posindoc and slip number,
# separated by tabs. Compiled once because it is applied to every input line.
_HEADER_PATTERN = re.compile(
    r'^([\d\-]+)\t(\d+)\t(\d+)\t([\d\-+abcdJ⑦⑨⑩⑫⑬⑭⑯]+)$')


def sliptextinput(filename):
    '''
    Reads the original slip text file and splits it into four record lists.

    The index of a record within its list serves as the ID of that record.

    Parameters
    ----------
    filename : str
        Path of the input file (read as UTF-8, a leading BOM is ignored).
        Structure of file (One slip, two or more lines):
            docstyle,tab,posinstyle,tab,posindoc,tab,slip number
            sliptext
            sliptext
            ...
        docstyle: document style
        posinstyle: position of document within a group of same docstyle
        posindoc: position of slip within the same document

    Returns
    -------
    sliptext : list
        [[slip number, slip text]], one record per metadata line.
        Multiline slip texts are joined with '#' as linebreak marker.
    monjo : list
        [[slip IDs, style ID]], one record per document. The slip IDs of
        all slips forming the document are joined with '+' into one string.
    yousiki : list
        [[style symbol, style name]], one record per document style.
        The style name is not contained in the input file and is therefore
        always the empty string.
    filetext : list
        [[document ID]], records the documents in order of appearance
        in the input file.

    Raises
    ------
    ValueError
        If a slip text line appears before the first metadata line, or if
        a slip with posindoc other than 0 appears before any document
        has been started.
    '''
    sliptext = []
    monjo = []
    yousiki = []
    # Built in file order, so at present the index of an entry equals the
    # document ID it holds; later editing of the list could change that.
    filetext = []

    with open(filename, 'r', encoding='utf-8-sig') as datafile:
        for lineno, rawline in enumerate(datafile, start=1):
            line = rawline.rstrip('\n')
            header = _HEADER_PATTERN.search(line)
            if header:  # Hits a line with metadata concerning docstyle etc.
                docstyle, posinstyle, posindoc, slipnumber = header.groups()
                sliptext.append([slipnumber, ''])
                slip_id = len(sliptext) - 1
                if posinstyle == '0' and posindoc == '0':
                    # First slip of the first document of a style.
                    yousiki.append([docstyle, ''])
                if posindoc == '0':
                    # First slip of a document; the style ID is the ID of
                    # the most recently registered style (-1 if none yet).
                    monjo.append([str(slip_id), len(yousiki) - 1])
                    filetext.append([len(monjo) - 1])
                elif monjo:
                    # Further slip of the document currently being read.
                    monjo[-1][0] += '+' + str(slip_id)
                else:
                    raise ValueError(
                        f'{filename}, line {lineno}: slip with posindoc '
                        f'{posindoc} appears before any document start')
            elif not sliptext:
                raise ValueError(
                    f'{filename}, line {lineno}: slip text appears before '
                    'the first metadata line')
            elif sliptext[-1][1] == '':
                sliptext[-1][1] = line
            else:
                # In case of multiline texts, linebreak '#' is inserted
                # between the lines.
                sliptext[-1][1] += '#' + line
    return (sliptext, monjo, yousiki, filetext)


def outputtable(recordlist, filename, head):
    '''
    Outputs a list of records as a CSV file

    Every output line starts with the index of the record within
    recordlist, followed by the fields of the record.

    Parameters
    ----------
    recordlist : List
        List of records.
        Every record is a list with as many elements as there are fields
    filename : str
        output file (written as UTF-8 with BOM)
    head : str
        head line of csv file

    Returns
    -------
    None.

    '''
    with open(filename, 'w', encoding='utf-8-sig') as outputfile:
        outputfile.write(head + '\n')
        for n, record in enumerate(recordlist):
            # NOTE: fields are written verbatim without CSV quoting, so a
            # field containing ',' adds a column to its line.
            fields = [str(n)]
            fields.extend(str(field) for field in record)
            outputfile.write(','.join(fields) + '\n')


def main():
    '''
    Converts the slip text file into the four CSV tables.
    '''
    filename = 'shakumon(linknuki).txt'
    sliptext, monjo, yousiki, filetext = sliptextinput(filename)
    outputtable(sliptext, 'sliptext.csv', 'ID,簡番号,釈文')
    outputtable(monjo, 'monjo.csv', 'ID,構成簡,様式分類')
    outputtable(yousiki, 'yousiki.csv', 'ID,記号,名称')
    outputtable(filetext, 'filetext.csv', 'ID,文書')


if __name__ == "__main__":
    main()