import argparse
import json
import os
import pathlib
import random
import sys

from alive_progress import alive_bar
import ijson
import pydub

# Argument parser.
# The defaults are the values the script used to force onto ``args`` after
# parsing (which silently discarded anything given on the command line).
# Running without arguments therefore behaves as before, while explicit
# arguments are now honoured.
parser = argparse.ArgumentParser(description='json file path')
parser.add_argument('--path', dest='filePath', type=str,
                    default='./test.json', help='target json file path')
parser.add_argument('--language', dest='language', type=str,
                    default='CHS', help='target language')
parser.add_argument('--character', dest='character', type=str,
                    default='甘雨', help='target character')
args = parser.parse_args()

# Module-level settings; the functions below read ``targetCharacter`` from here.
filePath = str(args.filePath)
targetLanguage = str(args.language)
targetCharacter = str(args.character)

# TODO: function to check if argument is valid or not


def prefixListGenerator(filePath, targetLanguage, targetCharacter):
    """Collect the top-level keys of entries matching language and character.

    The JSON file is streamed with ``ijson.parse``. An entry is selected when
    it has a ``language`` field equal to ``targetLanguage`` and an ``npcName``
    field equal to ``targetCharacter``; the two fields may appear in either
    order inside the entry.

    Args:
        filePath: Path of the source JSON file.
        targetLanguage: Value the entry's ``language`` field must equal.
        targetCharacter: Value the entry's ``npcName`` field must equal.

    Returns:
        List of top-level keys (first component of the ijson prefix), in the
        order they are encountered in the file.
    """
    prefixlist = []
    # Events belonging to one top-level entry are contiguous in the stream,
    # so it is enough to remember the match state of the current entry.
    currentKey = None
    languageMatched = False
    characterMatched = False
    with open(filePath, 'r', encoding='utf-8') as f:
        for prefix, event, value in ijson.parse(f):
            isLanguage = prefix.endswith('.language') and value == targetLanguage
            isCharacter = prefix.endswith('.npcName') and value == targetCharacter
            if not (isLanguage or isCharacter):
                continue
            key = str(prefix).split('.')[0]
            if key != currentKey:
                # A new entry started: forget matches from the previous one.
                currentKey = key
                languageMatched = False
                characterMatched = False
            languageMatched = languageMatched or isLanguage
            characterMatched = characterMatched or isCharacter
            # Guard against appending the same entry twice.
            if languageMatched and characterMatched and key not in prefixlist[-1:]:
                prefixlist.append(key)
    print("------ prefixlist grenerated length: " + str(len(prefixlist)) + " ------")
    return prefixlist


def resultJsonGenerator(prefixlist, filePath):
    """Write ``<cwd>/<targetCharacter>/result.json`` for the selected entries.

    For every key in ``prefixlist`` the entry is read from ``filePath``, its
    ``fileName`` suffix is changed to ``.wav`` and, if that wav file exists
    (relative paths resolve against the current working directory), the entry
    is stored in the result. Keys whose wav file is missing are dropped.

    Args:
        prefixlist: Top-level keys to extract; updated in place so that it
            only contains the keys that were kept.
        filePath: Path of the source JSON file.

    Returns:
        The same ``prefixlist`` object, without the keys whose wav file was
        not found.
    """
    result = {}
    keptPrefixes = []
    dstFilePathCharacter = pathlib.Path(pathlib.Path.cwd(), targetCharacter, 'result.json')
    # Create the output directory up front so it exists even when nothing is
    # selected; later steps write their files into the same directory.
    dstFilePathCharacter.parent.mkdir(parents=True, exist_ok=True)

    print('------ result.json generating ------')
    with alive_bar(len(prefixlist)) as bar:
        for prefixitem in prefixlist:
            bar()
            fileMissing = False
            with open(filePath, 'r', encoding='utf-8') as t:
                for dataItems in ijson.items(t, prefixitem):
                    # update file suffix from .wem to .wav
                    wavPath = pathlib.Path(dataItems['fileName']).with_suffix('.wav')
                    dataItems['fileName'] = str(wavPath)
                    if wavPath.exists():
                        result[prefixitem] = dataItems
                    else:
                        fileMissing = True
                        print('removed prefixitem: ' + str(prefixitem) + ' reason: file no found')
            # Collect survivors in a new list instead of removing from the
            # list being iterated, which would skip the following element.
            if not fileMissing:
                keptPrefixes.append(prefixitem)

    # Write once, after all entries are known (an empty selection yields "{}").
    with open(dstFilePathCharacter, 'w', encoding='utf-8') as out:
        out.write(json.dumps(result, ensure_ascii=False))

    prefixlist[:] = keptPrefixes
    return prefixlist


def wavFileClassify(srcPath, dstPath):
    """Convert ``srcPath`` to a mono, 22050 Hz wav file written to ``dstPath``."""
    audio = pydub.AudioSegment.from_file(srcPath)
    audio = audio.set_channels(1)
    audio = audio.set_frame_rate(22050)
    audio.export(dstPath, format='wav')


def datasetGenerator(prefixlist):
    """Append training lines to ``list.txt`` and ``list_val.txt``.

    Reads ``<cwd>/<targetCharacter>/result.json`` and, for every key in
    ``prefixlist`` whose entry has a ``text`` field, appends the line
    ``wavs/<fileName>|<text>`` to ``list.txt``. Each such line is additionally
    appended to ``list_val.txt`` when ``random.randint(1, 10) == 1``.

    Both list files are opened in append mode, so existing content is kept.

    Args:
        prefixlist: Keys of the entries in ``result.json`` to export.
    """
    characterDir = pathlib.Path(pathlib.Path.cwd(), targetCharacter)
    dstFilePathdataset = characterDir / 'list.txt'
    dstFilePathValdataset = characterDir / 'list_val.txt'
    dstFilePathCharacter = characterDir / 'result.json'

    # result.json only holds the selected entries, so load it once instead of
    # re-scanning the file for every key.
    with open(dstFilePathCharacter, 'r', encoding='utf-8') as resultFile:
        resultData = json.load(resultFile)

    with open(dstFilePathdataset, 'a', encoding='utf-8') as dataset, \
            open(dstFilePathValdataset, 'a', encoding='utf-8') as valDataset:
        print('------ dataset generating ------')
        with alive_bar(len(prefixlist)) as bar:
            for prefixitem in prefixlist:
                bar()
                dataItems = resultData.get(prefixitem)
                if dataItems is None or 'text' not in dataItems:
                    continue
                # generate training dataset list.txt
                wavRelPath = pathlib.Path('wavs', str(dataItems['fileName']))
                datastring = wavRelPath.as_posix() + '|' + str(dataItems['text']) + '\n'
                dataset.write(datastring)
                # generate training valdataset list_val.txt
                if random.randint(1, 10) == 1:
                    valDataset.write(datastring)


# file select by filename in result.json
def fileSelector():
    """Convert every wav referenced by ``result.json`` into the ``wavs`` folder.

    Reads all ``fileName`` values from ``<cwd>/<targetCharacter>/result.json``
    and, for each source file that exists under the current working directory,
    writes a converted copy (see ``wavFileClassify``) to
    ``<cwd>/<targetCharacter>/wavs/<fileName>``. Missing source files are
    skipped.

    Exits the process with status 1 when ``result.json`` does not exist.
    """
    workspacePath = pathlib.Path.cwd()
    srcFilePathCharacter = pathlib.Path(workspacePath, targetCharacter, 'result.json')

    if not srcFilePathCharacter.exists():
        print('------ result.json do not exists! check for result.json generated or not ------')
        # Non-zero status so callers can detect the failure.
        sys.exit(1)

    with open(srcFilePathCharacter, 'r', encoding='utf-8') as resultFile:
        filePathList = [
            value
            for prefix, event, value in ijson.parse(resultFile)
            if prefix.endswith('.fileName')
        ]

    print('------ classfying wav file with frame rate 22050 channels 1 ------')
    with alive_bar(len(filePathList)) as bar:
        for srcFilePathTemp in filePathList:
            bar()
            srcFilePath = pathlib.Path(workspacePath, srcFilePathTemp)
            if not srcFilePath.exists():
                continue
            dstFilePath = pathlib.Path(workspacePath, targetCharacter, 'wavs', srcFilePathTemp)
            dstFilePath.parent.mkdir(parents=True, exist_ok=True)
            wavFileClassify(srcFilePath, dstFilePath)


if __name__ == "__main__":
    prefixlist = prefixListGenerator(filePath, targetLanguage, targetCharacter)
    prefixlist = resultJsonGenerator(prefixlist, filePath)
    datasetGenerator(prefixlist)
    fileSelector()
    print("------ done! check result.json for what you need ------")