- 先用CSNovelCrawler把要抓的小說抓下來,CSNovelCrawler可以從這裡抓:http://rngmontoli.blogspot.tw/2013/06/csnovelcrawler.html
- CSNovelCrawler抓下來的檔案是純文字檔,得把它轉成HTML或是doc檔案。
- 轉成HTML檔案
- 轉成DOC檔案:
- 用calibre把HTML或是DOC檔案轉成epub
最終放棄DOC檔案的契機是,我不是每台電腦都有WORD啊。要換成open office還得重寫open office的script。算了,試試看HTML檔案吧。
還有一點要注意:用Calibre把HTML檔轉成epub的時候,它預設不會把<h2>標籤輸出成epub的bookmark,所幸可以手動指定(第一級目錄頁設為 //h:h1,第二級目錄頁設為 //h:h2):
- 開檔案
- 把一些字轉換掉(主要是空白跟簡體)
- 一行一行找"第x章"這類的文字,有的話就加上<h2>標籤,不然就是<p>標籤。
- 如果是<h2>標籤的文字,再把中文數字轉成阿拉伯數字,原因是我覺得中文數字既不直覺又佔字數:"一千一百一十八"跟"1118"相比,我還是喜歡1118。轉換的程式直接採用網路上找到的這一份即可(https://github.com/binux/binux-tools/blob/master/python/chinese_digit.py)。
還有一點要注意:用Calibre把HTML檔轉成epub的時候,它預設不會把<h2>標籤輸出成epub的bookmark,所幸可以手動指定(第一級目錄頁設為 //h:h1,第二級目錄頁設為 //h:h2):
import html
import os
import sys

# Numeric value of every numeral character GetResultForDigit understands.
# (Formerly a module-level variable named ``dict``, which shadowed the builtin.)
CHINESE_DIGITS = {
    '零': 0, '〇': 0, '一': 1, '二': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '十': 10, '百': 100, '千': 1000, '萬': 10000, '億': 100000000,
    # Formal ("banker's") numerals.
    '壹': 1, '貳': 2, '參': 3, '肆': 4, '伍': 5,
    '陸': 6, '柒': 7, '捌': 8, '玖': 9,
    '拾': 10, '佰': 100, '仟': 1000,
    '兩': 2,
    # Simplified-Chinese forms of the same units, accepted as well.
    '万': 10000, '亿': 100000000, '两': 2,
}
# ASCII digits 0-9.
CHINESE_DIGITS.update({str(i): i for i in range(10)})
# Full-width digits U+FF10..U+FF19, written as code points so they cannot be
# silently normalised into their ASCII look-alikes.
CHINESE_DIGITS.update({chr(0xFF10 + i): i for i in range(10)})

# Whitespace clean-up, applied in this order.  The literals are spelled with
# escapes because the different kinds of space are indistinguishable on screen.
_SPACE_REPLACEMENTS = (
    ("    ", ""),          # remove runs of 4 half-width spaces
    ("\u3000\u3000", ""),  # remove 2 full-width (ideographic) spaces
    ("\u3000", " "),       # remaining full-width space -> half-width space
    # NOTE(review): the character named by the original comment was lost;
    # U+00A0 (no-break space) is assumed here -- confirm against real input.
    ("\u00a0", " "),
)

# NOTE(review): in this copy of the file both sides of each punctuation pair
# are the same character, so the pairs currently do nothing.  They were
# presumably full-width/half-width pairs; restore them with \uFFxx escapes
# once the intended direction is confirmed.
_TEXT_REPLACEMENTS = (
    ("(", "("),
    (")", ")"),
    (",", ","),
    (":", ":"),
    ("!", "!"),
    ("?", "?"),
    # Undo over-eager simplified -> traditional conversions.
    ("隻是", "只是"),
    ("麵對", "面對"),
)

# A line is only considered a chapter heading when '第' and '章' both appear
# this close to the start of the line.
_MAX_ORDINAL_POS = 5
_MAX_CHAPTER_POS = 10


def GetResultForDigit(a, encoding="utf-8"):
    """Convert a Chinese (or mixed Chinese/Arabic) numeral string to decimal.

    Args:
        a: Numeral text such as "一千一百一十八" or "12".
        encoding: Unused; kept so callers that pass it keep working.

    Returns:
        The value as a decimal string, e.g. "1118".  Characters that are not
        in CHINESE_DIGITS are skipped, so an empty string yields "0".
    """
    result = 0   # value accumulated below the current 億 group
    pending = 0  # digits read but not yet multiplied by a unit
    billions = 0  # value of everything at or above 億
    for char in a:
        value = CHINESE_DIGITS.get(char)
        if value is None:
            # Unknown characters used to be treated as the digit 0, which
            # multiplied the pending digits by ten; ignore them instead.
            continue
        if value == 100000000:
            # 億: fold everything so far into the billions accumulator.
            billions = billions * 100000000 + (result + pending) * value
            result = 0
            pending = 0
        elif value == 10000:
            # 萬: scales everything accumulated so far within this group.
            result = (result + pending) * value
            pending = 0
        elif value >= 10:
            # 十 / 百 / 千: a bare unit (e.g. leading "十") counts as 1 x unit.
            result += value * (pending or 1)
            pending = 0
        else:
            # Plain digit; consecutive digits form a positional number ("123").
            pending = pending * 10 + value
    return str(result + pending + billions)


def _normalize_line(line):
    """Apply the whitespace and wording replacements to one input line."""
    for old, new in _SPACE_REPLACEMENTS + _TEXT_REPLACEMENTS:
        line = line.replace(old, new)
    return line


def _chapter_heading(text):
    """Return text rewritten as a "第N章 title" heading, or None if it is not one.

    A heading needs '第' within the first 5 characters, '章' after it within
    the first 10, and nothing but numerals between the two.
    """
    chapter_pos = text.find("章")
    if not 0 <= chapter_pos < _MAX_CHAPTER_POS:
        return None
    ordinal_pos = text.find("第")
    if not (0 <= ordinal_pos < _MAX_ORDINAL_POS and ordinal_pos < chapter_pos):
        return None
    numeral = "".join(text[ordinal_pos + 1:chapter_pos].split())
    # Requiring a pure numeral keeps ordinary sentences that merely contain
    # 第...章 from being promoted to headings.
    if not numeral or any(char not in CHINESE_DIGITS for char in numeral):
        return None
    # Rebuild the heading from its parts rather than str.replace(), so that
    # numerals inside the chapter title are left untouched.
    heading = text[:ordinal_pos + 1] + GetResultForDigit(numeral) + "章"
    title = text[chapter_pos + 1:].strip()
    if title:
        # Exactly one half-width space between '章' and the title.
        heading += " " + title
    return heading


def Katino_format(input_file_name, author):
    """Convert a plain-text novel into an HTML file next to the input.

    The output has the same name as the input with the extension replaced by
    ".html".  The file's base name becomes the <title> and <h1>; every line
    recognised as a chapter heading ("第x章 ...") becomes an <h2> with its
    number converted to Arabic digits; every other non-blank line becomes
    a <p>.

    Args:
        input_file_name: Path of the UTF-8 text file (a BOM is tolerated).
        author: Author name written into the Author <meta> tag.

    Returns:
        The path of the HTML file that was written.

    Raises:
        ValueError: If the output path would be the input file itself.
        OSError: If either file cannot be opened.
    """
    base_name, _ = os.path.splitext(input_file_name)
    output_file_name = base_name + '.html'
    if (os.path.normcase(os.path.abspath(output_file_name))
            == os.path.normcase(os.path.abspath(input_file_name))):
        # Opening the output for writing would truncate the input.
        raise ValueError("input file must not already be the .html output")

    title = html.escape(os.path.basename(base_name))
    # 'utf-8-sig' drops a leading BOM, which would otherwise end up in the
    # first line and shift the heading detection.
    with open(input_file_name, 'r', encoding='utf-8-sig') as input_file, \
            open(output_file_name, 'w', encoding='utf8') as output_file:
        print('file is opened.')
        output_file.write("<!DOCTYPE html>\n")
        output_file.write("<html>\n")
        output_file.write("<head>\n")
        # The file is written as UTF-8; say so, so readers need not guess.
        output_file.write("<meta charset=\"utf-8\">\n")
        output_file.write("<title>" + title + "</title>\n")
        output_file.write("<meta name=\"Author\" content=\""
                          + html.escape(author) + "\">\n")
        output_file.write("</head>\n")
        output_file.write("<body>\n")
        output_file.write("<h1>" + title + "</h1>\n")

        for line in input_file:
            text = _normalize_line(line).rstrip("\n")
            if not text.strip():
                continue  # blank lines carry no content
            heading = _chapter_heading(text)
            if heading is not None:
                print(heading)
                output_file.write(
                    "<h2>" + html.escape(heading, quote=False) + "</h2>\n")
            else:
                output_file.write(
                    "<p>" + html.escape(text, quote=False) + "</p>\n")

        output_file.write("</body>\n")
        output_file.write("</html>\n")
    return output_file_name


if __name__ == "__main__":
    author = '蝴蝶藍'
    Katino_format('[蝴蝶藍] 天醒之路[1].txt', author)
    print('done.')