• R/O
  • SSH
  • HTTPS

chaki: Commit


Commit MetaInfo

Revisión 646 (tree)
Tiempo 2021-01-05 22:13:07
Autor tomorita

Log Message

長単位付きCabochaファイルを"Mecab|Cabocha|UniDic2|LUW"形式でインポートする処理を追加

Cambiar Resumen

Diferencia incremental

--- trunk/ChaKi.NET/src/CreateCorpusSLA/CreateCorpus.cs (revision 645)
+++ trunk/ChaKi.NET/src/CreateCorpusSLA/CreateCorpus.cs (revision 646)
@@ -46,7 +46,21 @@
4646 private LexiconBuilder m_LexBuilder = null;
4747 private bool m_IsFolderInput;
4848
49+ public void ResetInternals() // Clears this instance's working fields: each is set to null, except m_SentencesInDocuments, which is replaced by a new empty list.
50+ {
51+ m_Service = null;
52+ m_Corpus = null;
53+ m_DefaultString = null;
4954
55+ m_SentencesInDocuments = new List<int>();
56+ m_DocumentSet = null;
57+ m_DocumentSetProjectMapping = null;
58+ m_User = null;
59+ m_Project = null;
60+ m_LexBuilder = null;
61+
62+ } // NOTE(review): m_IsFolderInput is not reset here - confirm that is intended.
63+
5064 /// <summary>
5165 /// CreateCorpusの一連の処理を実行してDatabaseを作成する.
5266 /// 呼び出し前にパラメータフィールドが適切に設定されている必要がある.
@@ -821,7 +835,7 @@
821835 {
822836 if (pos.ID < 0)
823837 {
824- pos.ID = ++ m_LexBuilder.LastLexemeID;
838+ pos.ID = ++m_LexBuilder.LastLexemeID;
825839 }
826840 cmd.CommandText = string.Format("INSERT INTO part_of_speech VALUES({0},'{1}','{2}','{3}','{4}','{5}')",
827841 pos.ID, EscapeQuote(pos.Name1), EscapeQuote(pos.Name2), EscapeQuote(pos.Name3), EscapeQuote(pos.Name4), EscapeQuote(pos.Name));
@@ -1176,11 +1190,12 @@
11761190 foreach (Word word in sen.Words)
11771191 {
11781192 word.ID = wordid++;
1179- cmd.CommandText = string.Format("INSERT INTO word VALUES ({0},{1},{2},{3},{4},{5},{6},{7},null,null,null,'{8}',{9})",
1193+ cmd.CommandText = string.Format("INSERT INTO word VALUES ({0},{1},{2},{3},{4},{5},{6},{7},null,null,null,'{8}',{9},{10})",
11801194 word.ID, word.Sen.ID, word.StartChar, word.EndChar, word.Lex.ID,
11811195 (word.Bunsetsu == null) ? 0 : word.Bunsetsu.ID, m_DefaultString, word.Pos,
11821196 word.Extras,
1183- m_Project.ID);
1197+ m_Project.ID,
1198+ (int)word.HeadInfo);
11841199 cmd.ExecuteNonQuery();
11851200 }
11861201 if (n > 0 && n % 500 == 0)
@@ -1198,7 +1213,7 @@
11981213 Console.WriteLine("\nSaving Segments...");
11991214 foreach (Segment seg in m_Corpus.Segments)
12001215 {
1201- cmd.CommandText = string.Format("INSERT INTO segment VALUES({0},{1},{2},{3},{4},{5},'',{6},{7},{8},'{9}',{10})",
1216+ cmd.CommandText = string.Format("INSERT INTO segment VALUES({0},{1},{2},{3},{4},{5},'',{6},{7},{8},'{9}',{10},'')",
12021217 seg.ID, seg.Tag.ID, seg.Version.ID, seg.Doc.ID, seg.StartChar, seg.EndChar,
12031218 m_Project.ID, 0/*user_id*/,
12041219 m_Service.GetDefault(), EscapeQuote(seg.Comment), seg.Sentence.ID);
@@ -1227,7 +1242,7 @@
12271242 Console.WriteLine("\nSaving Links...");
12281243 foreach (Link lnk in m_Corpus.Links)
12291244 {
1230- cmd.CommandText = string.Format("INSERT INTO link VALUES({0},{1},{2},{3},{4},{5},{6},{7},'{8}',{9},{10})",
1245+ cmd.CommandText = string.Format("INSERT INTO link VALUES({0},{1},{2},{3},{4},{5},{6},{7},'{8}',{9},{10},'')",
12311246 lnk.ID, lnk.Tag.ID, lnk.Version.ID, lnk.From.ID, lnk.To.ID,
12321247 m_Project.ID, 0/*user_id*/, m_Service.GetDefault(), EscapeQuote(lnk.Comment),
12331248 lnk.FromSentence.ID, lnk.ToSentence.ID);
--- trunk/ChaKi.NET/src/CreateCorpusSLA/Program.cs (revision 645)
+++ trunk/ChaKi.NET/src/CreateCorpusSLA/Program.cs (revision 646)
@@ -57,6 +57,13 @@
5757 DoJobs(cc);
5858 }
5959 }
60+ else if (cc.ReaderType == "Mecab|Cabocha|UniDic2|LUW")
61+ {
62+ // LUW(長単位)アノテーション付きのCabochaファイルの処理
63+ // Proj 0に通常のCabochaインポートを行った後、
64+ // 長単位アノテーションをProj 1に追加する2段階のインポート処理になる.
65+ DoLUWJobs(cc);
66+ }
6067 else
6168 {
6269 DoJobs(cc);
@@ -120,6 +127,26 @@
120127 Console.WriteLine("Finished at {0}; Elapsed {1} minutes", t1.ToLocalTime(), elapsed.TotalMinutes);
121128 }
122129
130+ static private void DoLUWJobs(CreateCorpus cc) // Two-pass import: runs DoJobs on the original input, then again on a temporary file produced by LuwCabochaUtil.Convert with ProjectId = 1.
131+ {
132+ // 1. Run the regular Cabocha import (Proj 0)
133+ cc.ReaderType = "Mecab|Cabocha|UniDic2";
134+ DoJobs(cc);
135+ // 2. Temporarily create a long-unit Cabocha file from the LUW annotations
136+ Console.WriteLine("=========================");
137+ Console.WriteLine("Extracting LUW part to temporary cabocha file...");
138+ var path = Path.GetTempFileName(); // NOTE(review): the temporary file is not deleted anywhere in this method - confirm whether cleanup is needed.
139+ LuwCabochaUtil.Convert(cc.InputPath, path);
140+ Console.WriteLine($"Written to: {path}");
141+ // 3. Import the long-unit cabocha file into Proj 1
142+ Console.WriteLine("=========================");
143+ Console.WriteLine("Importing temporary cabocha file of LUWs...");
144+ cc.ResetInternals();
145+ cc.InputPath = path;
146+ cc.ProjectId = 1;
147+ DoJobs(cc);
148+ }
149+
123150 static void PrintUsage()
124151 {
125152 Console.WriteLine("Usage: CreateCorpus [Options] <InputFile> <Output>");
Show on old repository browser