# -*- coding: utf-8 -*-
"""
Created on Mon Mar 18 17:36:13 2024

@author: Arnd Helmut Hafner

Processing index data in order to identify items in the original text and
add tags.

The script runs as a linear pipeline:
  1. read the index data files (personal names, offices, ranks, labor,
     place names) and dump the collected lists,
  2. build a dictionary keyed by slip number and match it against the slip
     text,
  3. prepare, consolidate and tag the strings,
  4. read and tag the historical date index data,
  5. replace the strings in the slip text with their tagged versions and
     write the result to 'shakumon(with tags).txt'.
"""
import sys  # for sys.exit after error
import re
from re import compile, search
import copy
import tools
import inputmethods
import dataprocessing

# ---------------------------------------------------------------------------
# Reading index data
# ---------------------------------------------------------------------------
# Main variables
# lemmata: list of tuples containing headwords, standardized form of
# headwords and index ID -> [(headword, headst, ID)]
lemmata = []
# lemmatano: counts the number of lemmata. Will be stored in items.
lemmatano = -1
# items: list of tuples containing original items, standardized form of
# items, number of headword, slip number, slip number notes and general notes
# -> [(stringori, stringreg, lemmatano, slipnumber, slipnote, notes)]
items = []
# refitems: list of tuples containing the same information for reference
# items as `items` holds for ordinary items
# -> [(stringori, stringreg, lemmatasu, slipnumber, slipnote, notes)]
refitems = []

# Reading personal name data
filename = 'indexdata(personal names).txt'
ID = 'persname'
lemmata, lemmatano, items = inputmethods.persnameinput(
    lemmata, lemmatano, items, ID, filename)

# Reading official names and data of two other indices
# (these three indices entail reference items)
officialandofilename = (
    'indexdata(offices).txt',
    'indexdata(ranks).txt',
    'indexdata(labor).txt',
)
IDs = ('office', 'rank', 'labor')
for filename, ID in zip(officialandofilename, IDs):
    lemmata, lemmatano, items, refitems = inputmethods.officialandotherinput(
        lemmata, lemmatano, items, refitems, ID, filename)

# Reading place name index data
filename = 'indexdata(place names).txt'
ID = 'placename'
lemmata, lemmatano, items, plnotes = inputmethods.placenameinput(
    lemmata, lemmatano, items, ID, filename)

# Outputting data: one file per list, together with the number of fields
# each tuple of that list carries
dictlist = (lemmata, items, refitems)
filenames = ('output(lemmata).txt', 'output(items).txt', 'output(refitems).txt')
fieldnumber = (3, 6, 6)
for tuplelist, nfields, outname in zip(dictlist, fieldnumber, filenames):
    tools.tuplelistoutput(tuplelist, nfields, outname)

# ---------------------------------------------------------------------------
# Matching index data with the slip text
# ---------------------------------------------------------------------------
# Creating a dictionary for access to data by slip number in order to match
# with the original text; includes a consistency check on appearance
# frequencies
slipnodic = dataprocessing.createdic(lemmata, items, refitems)
tools.dictoutput(slipnodic, 'output(slipnumberdictionary)01.txt', '\n')

# Reading slip text
sliptext, slipnolist = inputmethods.sliptextinput('shakumon(linknuki).txt')
# Optional check output of the slip text input (disabled):
#inputmethods.sliptextcheckoutput(sliptext,slipnolist,'logsliptextinput.txt')

# Preliminary consistency check between index data and slip text data
dataprocessing.simpleconsistencycheck(
    slipnodic, sliptext, 'logsliptextconsistency(simple check)01.txt')

# Replacing index data strings with text data strings.
# stringori is already a simplified version of the original text data
# strings. For accurate tagging of the original text data, strings must be
# reversed into their original form.
slipnodic = dataprocessing.stringorireplacement(
    slipnodic, sliptext, 'logsliptextconsistency(stringreplacement)01.txt')
tools.dictoutput(slipnodic, 'output(slipnumberdictionary)02.txt', '\n')

# String preparation.
# Information necessary for matching is spread among the original strings
# and standardized strings. The following module concentrates the
# information into one string and makes it matchable.
slipnodic = dataprocessing.stringprep(slipnodic, 'logstringpreparation.txt')
tools.dictoutput(slipnodic, 'output(slipnumberdictionary)03.txt', '\n')

# Consolidation of overlapping index data
slipnodic = dataprocessing.indexdataconsolidation(
    slipnodic, 'logindexdataconsolidation.txt')
tools.dictoutput(slipnodic, 'output(slipnumberdictionary)04.txt', '\n')

# Tag preparation
slipnodic = dataprocessing.tagprep(slipnodic, lemmata, 'logtagpreparation.txt')
tools.dictoutput(slipnodic, 'output(slipnumberdictionary)05.txt', '\n')

# Tagging of the strings
slipnodic = dataprocessing.tagging(slipnodic, 'logtagging.txt')
tools.dictoutput(slipnodic, 'output(slipnumberdictionary)06.txt', '\n')

# Restoration of original string = removal of string preparations
slipnodic = dataprocessing.stringprepremoval(
    slipnodic, lemmata, 'logtagremoval.txt')
tools.dictoutput(slipnodic, 'output(slipnumberdictionary)07.txt', '\n')

# ---------------------------------------------------------------------------
# Historical dates
# ---------------------------------------------------------------------------
# Reading historical date index data
histdates = inputmethods.readdates(
    'indexdata(dates).txt', 'log(reading histdates).txt')
histdates = dataprocessing.histdatetagging(
    histdates, 'log(tagging histdates).txt')
tools.dictoutput(histdates, 'output(histdates).txt', '\n')

# ---------------------------------------------------------------------------
# Writing the tagged slip text
# ---------------------------------------------------------------------------
# Replacing the strings in the original text by tagged strings
newsliptext = dataprocessing.sliptextreplacement(
    slipnodic, sliptext, 'logsliptextreplacement.txt')
# NOTE(review): the log file name below has an unbalanced '(' ; it is kept
# unchanged so that the produced file name stays the same.
newsliptext = dataprocessing.histdatesreplacemnt(
    histdates, newsliptext, 'log(histdate string replacement.txt')

# The context manager closes the file even if a lookup or write fails.
with open('shakumon(with tags).txt', 'w', encoding='utf-8-sig') as output:
    for slipno in slipnolist:
        entry = newsliptext[slipno]
        # '#' stands for a line break in the slip text; the converted text
        # is stored back into newsliptext as before.
        # Assumes entry[0] is a str -- TODO confirm against sliptextinput.
        entry[0] = entry[0].replace('#', '\n')
        output.write('{}\t{}\t{}\t{}\n{}\n'.format(
            entry[1], entry[2], entry[3], slipno, entry[0]))

# Report the number of slips written; len() also works for an empty list,
# where a loop counter would be undefined.
print('outputted {}.items to shakumon(with tags).txt'.format(len(slipnolist)))