長単位付きCabochaファイルを"Mecab|Cabocha|UniDic2|LUW"形式でインポートする処理を追加
@@ -46,7 +46,21 @@ | ||
46 | 46 | private LexiconBuilder m_LexBuilder = null; |
47 | 47 | private bool m_IsFolderInput; |
48 | 48 | |
49 | + public void ResetInternals() | | // Clears this instance's working fields; DoLUWJobs calls it between its two DoJobs passes.
50 | + { | |
51 | + m_Service = null; | |
52 | + m_Corpus = null; | |
53 | + m_DefaultString = null; | |
49 | 54 | |
55 | + m_SentencesInDocuments = new List<int>(); | | // the only field given a fresh instance instead of null
56 | + m_DocumentSet = null; | |
57 | + m_DocumentSetProjectMapping = null; | |
58 | + m_User = null; | |
59 | + m_Project = null; | |
60 | + m_LexBuilder = null; | |
61 | + | |
62 | + } | | // NOTE(review): m_IsFolderInput (declared just above) is not reset here - confirm that is intended.
63 | + | |
50 | 64 | /// <summary> |
51 | 65 | /// CreateCorpusの一連の処理を実行してDatabaseを作成する. |
52 | 66 | /// 呼び出し前にパラメータフィールドが適切に設定されている必要がある. |
@@ -821,7 +835,7 @@ | ||
821 | 835 | { |
822 | 836 | if (pos.ID < 0) |
823 | 837 | { |
824 | - pos.ID = ++ m_LexBuilder.LastLexemeID; | |
838 | + pos.ID = ++m_LexBuilder.LastLexemeID; | |
825 | 839 | } |
826 | 840 | cmd.CommandText = string.Format("INSERT INTO part_of_speech VALUES({0},'{1}','{2}','{3}','{4}','{5}')", |
827 | 841 | pos.ID, EscapeQuote(pos.Name1), EscapeQuote(pos.Name2), EscapeQuote(pos.Name3), EscapeQuote(pos.Name4), EscapeQuote(pos.Name)); |
@@ -1176,11 +1190,12 @@ | ||
1176 | 1190 | foreach (Word word in sen.Words) |
1177 | 1191 | { |
1178 | 1192 | word.ID = wordid++; |
1179 | - cmd.CommandText = string.Format("INSERT INTO word VALUES ({0},{1},{2},{3},{4},{5},{6},{7},null,null,null,'{8}',{9})", | |
1193 | + cmd.CommandText = string.Format("INSERT INTO word VALUES ({0},{1},{2},{3},{4},{5},{6},{7},null,null,null,'{8}',{9},{10})", | |
1180 | 1194 | word.ID, word.Sen.ID, word.StartChar, word.EndChar, word.Lex.ID, |
1181 | 1195 | (word.Bunsetsu == null) ? 0 : word.Bunsetsu.ID, m_DefaultString, word.Pos, |
1182 | 1196 | word.Extras, |
1183 | - m_Project.ID); | |
1197 | + m_Project.ID, | |
1198 | + (int)word.HeadInfo); | |
1184 | 1199 | cmd.ExecuteNonQuery(); |
1185 | 1200 | } |
1186 | 1201 | if (n > 0 && n % 500 == 0) |
@@ -1198,7 +1213,7 @@ | ||
1198 | 1213 | Console.WriteLine("\nSaving Segments..."); |
1199 | 1214 | foreach (Segment seg in m_Corpus.Segments) |
1200 | 1215 | { |
1201 | - cmd.CommandText = string.Format("INSERT INTO segment VALUES({0},{1},{2},{3},{4},{5},'',{6},{7},{8},'{9}',{10})", | |
1216 | + cmd.CommandText = string.Format("INSERT INTO segment VALUES({0},{1},{2},{3},{4},{5},'',{6},{7},{8},'{9}',{10},'')", | |
1202 | 1217 | seg.ID, seg.Tag.ID, seg.Version.ID, seg.Doc.ID, seg.StartChar, seg.EndChar, |
1203 | 1218 | m_Project.ID, 0/*user_id*/, |
1204 | 1219 | m_Service.GetDefault(), EscapeQuote(seg.Comment), seg.Sentence.ID); |
@@ -1227,7 +1242,7 @@ | ||
1227 | 1242 | Console.WriteLine("\nSaving Links..."); |
1228 | 1243 | foreach (Link lnk in m_Corpus.Links) |
1229 | 1244 | { |
1230 | - cmd.CommandText = string.Format("INSERT INTO link VALUES({0},{1},{2},{3},{4},{5},{6},{7},'{8}',{9},{10})", | |
1245 | + cmd.CommandText = string.Format("INSERT INTO link VALUES({0},{1},{2},{3},{4},{5},{6},{7},'{8}',{9},{10},'')", | |
1231 | 1246 | lnk.ID, lnk.Tag.ID, lnk.Version.ID, lnk.From.ID, lnk.To.ID, |
1232 | 1247 | m_Project.ID, 0/*user_id*/, m_Service.GetDefault(), EscapeQuote(lnk.Comment), |
1233 | 1248 | lnk.FromSentence.ID, lnk.ToSentence.ID); |
@@ -57,6 +57,13 @@ | ||
57 | 57 | DoJobs(cc); |
58 | 58 | } |
59 | 59 | } |
60 | + else if (cc.ReaderType == "Mecab|Cabocha|UniDic2|LUW") | |
61 | + { | |
62 | + // LUW(長単位)アノテーション付きのCabochaファイルの処理 | |
63 | + // Proj 0に通常のCabochaインポートを行った後、 | |
64 | + // 長単位アノテーションをProj 1に追加する2段階のインポート処理になる. | |
65 | + DoLUWJobs(cc); | |
66 | + } | |
60 | 67 | else |
61 | 68 | { |
62 | 69 | DoJobs(cc); |
@@ -120,6 +127,26 @@ | ||
120 | 127 | Console.WriteLine("Finished at {0}; Elapsed {1} minutes", t1.ToLocalTime(), elapsed.TotalMinutes); |
121 | 128 | } |
122 | 129 | |
130 | + static private void DoLUWJobs(CreateCorpus cc) | | // Two-stage import for LUW-annotated Cabocha input: runs DoJobs(cc) twice, mutating cc in between.
131 | + { | |
132 | + // 1. Run the regular Cabocha import (Proj 0) | |
133 | + cc.ReaderType = "Mecab|Cabocha|UniDic2"; | | // overwrites the reader type on cc; it is not restored before returning
134 | + DoJobs(cc); | |
135 | + // 2. Extract the LUW annotation into a temporary long-unit Cabocha file | |
136 | + Console.WriteLine("========================="); | |
137 | + Console.WriteLine("Extracting LUW part to temporary cabocha file..."); | |
138 | + var path = Path.GetTempFileName(); | | // NOTE(review): nothing in this method deletes this temp file - confirm whether it should be cleaned up.
139 | + LuwCabochaUtil.Convert(cc.InputPath, path); | | // reads the original input path, writes to the temp path
140 | + Console.WriteLine($"Written to: {path}"); | |
141 | + // 3. Import the long-unit Cabocha file into Proj 1 | |
142 | + Console.WriteLine("========================="); | |
143 | + Console.WriteLine("Importing temporary cabocha file of LUWs..."); | |
144 | + cc.ResetInternals(); | | // clear the fields left over from the first pass before reusing cc
145 | + cc.InputPath = path; | | // NOTE(review): InputPath and ProjectId stay changed on cc after this method returns - verify callers do not reuse cc.
146 | + cc.ProjectId = 1; | |
147 | + DoJobs(cc); | |
148 | + } | |
149 | + | |
123 | 150 | static void PrintUsage() |
124 | 151 | { |
125 | 152 | Console.WriteLine("Usage: CreateCorpus [Options] <InputFile> <Output>"); |