/* GamMo Simple Morphological Analyzer

  version 0.87
  Date 2007/05/15  
  Copyright (c) 2007 by knit  
  License: MIT License  
    http://www.opensource.org/licenses/mit-license.php  
    日本語訳    
    http://www.opensource.jp/licenses/mit-license.html    

  Sample
    <script type="text/javascript" src="gammo.js" charset="UTF-8"></script>  
 */    

/**
 * GamMo analyzer constructor.
 *
 * Records the library version, creates one empty RegExp placeholder per
 * matcher property, and then calls this._compile() to load the real
 * patterns into those placeholders.
 */
function GamMo() {
  this.Version = "0.87";

  // Matcher properties, created in this order. A "conj" (conjunction)
  // matcher is intentionally not created; it is disabled in this version.
  var slots = [
    "kanji", "hiragana", "katakana", "word", "zenkaku",
    "ascii", "kigoh", "space", "joshi", "syuJoshi"
  ];
  for (var n = 0; n < slots.length; n += 1) {
    this[slots[n]] = new RegExp();
  }

  this._compile();
}

// Regular-expression source strings for the character-class matchers.
// Each string holds exactly one capturing group.
GamMo.prototype.pattern = {
  // Runs of characters in the range 一-龠 plus 々 〆 ヵ ヶ.
  kanji: "([一-龠々〆ヵヶ]+)",
  // Runs of characters in the range ぁ-ん.
  hiragana: "([ぁ-ん]+)",
  // Runs of characters in the range ァ-ヴ plus the long-vowel mark ー.
  katakana: "([ァ-ヴー]+)",
  // Runs of ASCII letters, digits and underscore.
  word: "([a-zA-Z0-9_]+)",
  // Runs of full-width letters and digits.
  zenkaku: "([ａ-ｚＡ-Ｚ０-９]+)",
  // Runs of characters in the ASCII range "!" through "~".
  ascii: "([!-~]+)",
  // A single punctuation/symbol character (Japanese or ASCII); no "+" here.
  kigoh: "([、。！？（）「」『』’”!-\/:-@\[-\^`\{-~])",
  // Runs of full-width space and other whitespace.
  space: "([　\\s\\n]+)"
};


/**
 * Builds every matcher used by the analyzer and stores it on the instance.
 *
 * The character-class matchers are built from this.pattern; the particle
 * matchers (joshi, syuJoshi) are built from the literal patterns below.
 * All matchers carry the "g" flag.
 *
 * Matchers are created with the RegExp constructor rather than the
 * deprecated RegExp.prototype.compile(), so each call replaces the
 * corresponding property with a fresh RegExp object.
 */
GamMo.prototype._compile = function() {
  this.kanji = new RegExp(this.pattern.kanji, "g");
  this.hiragana = new RegExp(this.pattern.hiragana, "g");
  this.katakana = new RegExp(this.pattern.katakana, "g");
  this.word = new RegExp(this.pattern.word, "g");
  this.zenkaku = new RegExp(this.pattern.zenkaku, "g");
  this.ascii = new RegExp(this.pattern.ascii, "g");
  this.kigoh = new RegExp(this.pattern.kigoh, "g");
  this.space = new RegExp(this.pattern.space, "g");

  // Particles. Alternatives are tried left to right and the first one that
  // matches wins, so "ところ[がでか]" must be listed before "とか?";
  // otherwise the shorter "と" always matches first and the longer form is
  // unreachable. Single-character particles come last for the same reason.
  this.joshi = new RegExp("(か[もら]?|きり|くせに|[くぐ]らい|けれども?|[ただ]って|こそ|さえ|し[かも]?|すら|ずつ|たり|だ[けの]|ったら|ってば|つつ|[てで][はも]|ところ[がでか]|とか?|ながら|など|な[らり]|なん[かて]|の[でにみ]|ばかり|ほ[かど]|まで|ものの|やら?|ゆえ|より|をば?|[がてでにのはばへも])", "g");

  // Disabled: conjunction matcher, kept for reference.
  // this.conj.compile("(だから|そこで|それゆえ|こうして|すると|しかし|けれども|ところが|しかるに|そして|それから|そのうえ|しかも|また|および|ならびに|ところで|され|それでは|では|するに|すなわち|つまり|ただし|なぜなら|もっとも|なお|ちなみに)");

  // Sentence-final particles: group 1 is the particle, group 2 is the run
  // of punctuation that must follow it directly.
  this.syuJoshi = new RegExp("(か[いな]?|かしら|って|ったら|ってば|とも?|なあ?|ねえ?|ものか?|よう?|[さぜぞのやわ])([.?!、。？！]+)", "g");
};

/**
 * Splits text into an array of tokens.
 *
 * The text is segmented by inserting the full-width bar "｜" as an internal
 * delimiter around particles and character-class runs, cleaning the result
 * up with this._fix(), and finally splitting on the delimiter.
 *
 * @param {string} str Text to segment.
 * @returns {string[]} Tokens in order of appearance.
 */
GamMo.prototype.parse = function(str) {
  // Full-width bars already present in the input are turned into ASCII "|"
  // so they cannot be mistaken for the internal delimiter.
  str = str.replace(/｜/g, "|");

  // Sentence-final particle followed by punctuation: isolate the particle.
  str = str.replace(this.syuJoshi, "｜$1｜$2");

  // Particles. The neighbouring characters are read from the callback's
  // standard (offset, string) arguments. The legacy RegExp.leftContext /
  // RegExp.rightContext statics are non-standard and are not guaranteed to
  // describe the match currently being replaced.
  str = str.replace(this.joshi, function(m0, m1, offset, whole) {
    var left = whole.charAt(offset - 1);
    var right = whole.charAt(offset + m0.length);
    // Leave the particle untouched when the next character repeats it, or
    // when a delimiter already sits directly before or after it.
    if (m1 === right || left === "｜" || right === "｜") {
      return m1;
    }
    return "｜" + m1 + "｜";
  });

  // Kanji runs get a delimiter in front only, so whatever follows stays in
  // the same token unless another rule splits it.
  str = str.replace(this.kanji, "｜$1");
  str = str.replace(this.katakana, "｜$1｜");
  str = str.replace(this.word, "｜$1｜");
  str = str.replace(this.zenkaku, "｜$1｜");
  str = str.replace(this.space, "｜$1｜");

  return this._fix(str).split("｜");
};

/**
 * Cleans up a string that already contains "｜" delimiters.
 *
 * @param {string} str Delimited text.
 * @returns {string} Text with small kana and "ん" re-attached, punctuation
 *   isolated, this.skip() applied, delimiter runs collapsed to one, and no
 *   delimiter at either end.
 */
GamMo.prototype._fix = function(str) {
  // Small kana never stand alone: drop the delimiters on both sides.
  str = str.replace(/｜*([ぁぃぅぇぉっゃゅょ])｜*/g, "$1");
  // "ん" is re-attached to whatever precedes it.
  str = str.replace(/｜+(ん)/g, "$1");
  // Every punctuation character becomes its own token.
  str = str.replace(this.kigoh, "｜$1｜");
  str = this.skip(str);
  // Collapse runs first, then trim the ends. A single alternation such as
  // /^｜|(｜)+|｜$/ cannot do both: the run alternative wins before the
  // end-of-string one, leaving a trailing delimiter and an empty token.
  str = str.replace(/｜+/g, "｜");
  return str.replace(/^｜|｜$/g, "");
};

/**
 * Re-joins a fixed list of words that earlier splitting may have broken up.
 *
 * For each listed word, any occurrence of its characters, optionally
 * separated by "｜" delimiters, is replaced by the intact word followed by
 * a single delimiter. Words are processed in default sort() order.
 *
 * @param {string} str Delimited text.
 * @returns {string} Text with the listed words restored.
 */
GamMo.prototype.skip = function(str) {
  var keepWhole = ["とても", "でき", "つもり", "あたり", "です",
    "しい", "はな", "わずか", "はばかる", "さて", "かい"].sort();

  // Strips the delimiters from the matched word and closes it with one.
  var rejoin = function(matched, body) {
    return body.replace(/｜+/g, "") + "｜";
  };

  for (var n = 0; n < keepWhole.length; n += 1) {
    // Allow any number of delimiters between consecutive characters.
    var source = "(" + keepWhole[n].split("").join("｜*") + ")";
    str = str.replace(new RegExp(source, "g"), rejoin);
  }
  return str;
};

/**
 * Extracts keyword-like tokens from text.
 *
 * The text is tokenised with this.parse(). A token is kept when it contains
 * kanji or katakana, or when it is longer than one character and contains
 * printable ASCII or full-width alphanumerics.
 *
 * @param {string} str Text to analyse.
 * @returns {string[]} Kept tokens, in order of appearance.
 */
GamMo.prototype.Keyword = function(str) {
  // The matchers carry the "g" flag, so RegExp.prototype.test() would
  // resume from the lastIndex left by the previous token and could reject
  // a token that does match. String.prototype.search() always scans from
  // the start and leaves lastIndex alone.
  var contains = function(text, re) {
    return text.search(re) !== -1;
  };

  var result = [];
  var words = this.parse(str);
  for (var i = 0, l = words.length; i < l; i++) {
    var token = words[i];
    if (contains(token, this.kanji) || contains(token, this.katakana)) {
      result.push(token);
    } else if (contains(token, this.ascii) || contains(token, this.zenkaku)) {
      // Single-character alphanumeric/symbol tokens are not kept.
      if (token.length > 1) result.push(token);
    }
  }
  return result;
};

/**
 * Finds every match of the kanji matcher in the text.
 *
 * @param {string} str Text to search.
 * @returns {string[]|boolean} The matches, or false when there are none.
 */
GamMo.prototype.Kanji = function(str) {
  var hits = str.match(this.kanji);
  return hits ? hits : false;
};

/**
 * Finds every match of the hiragana matcher in the text.
 *
 * @param {string} str Text to search.
 * @returns {string[]|boolean} The matches, or false when there are none.
 */
GamMo.prototype.Hiragana = function(str) {
  var hits = str.match(this.hiragana);
  return hits ? hits : false;
};

/**
 * Finds every match of the katakana matcher in the text.
 *
 * @param {string} str Text to search.
 * @returns {string[]|boolean} The matches, or false when there are none.
 */
GamMo.prototype.Katakana = function(str) {
  var hits = str.match(this.katakana);
  return hits ? hits : false;
};

/**
 * Finds every match of the word (ASCII alphanumeric) matcher in the text.
 *
 * @param {string} str Text to search.
 * @returns {string[]|boolean} The matches, or false when there are none.
 */
GamMo.prototype.Word = function(str) {
  var hits = str.match(this.word);
  return hits ? hits : false;
};

/**
 * Finds every match of the zenkaku (full-width alphanumeric) matcher in
 * the text.
 *
 * @param {string} str Text to search.
 * @returns {string[]|boolean} The matches, or false when there are none.
 */
GamMo.prototype.Zenkaku = function(str) {
  var hits = str.match(this.zenkaku);
  return hits ? hits : false;
};

/**
 * Finds every match of the ascii matcher in the text.
 *
 * @param {string} str Text to search.
 * @returns {string[]|boolean} The matches, or false when there are none.
 */
GamMo.prototype.Ascii = function(str) {
  var hits = str.match(this.ascii);
  return hits ? hits : false;
};

/**
 * Finds every match of the kigoh (punctuation/symbol) matcher in the text.
 *
 * @param {string} str Text to search.
 * @returns {string[]|boolean} The matches, or false when there are none.
 */
GamMo.prototype.Kigoh = function(str) {
  var hits = str.match(this.kigoh);
  return hits ? hits : false;
};

/**
 * Finds every match of the space matcher in the text.
 *
 * @param {string} str Text to search.
 * @returns {string[]|boolean} The matches, or false when there are none.
 */
GamMo.prototype.Space = function(str) {
  var hits = str.match(this.space);
  return hits ? hits : false;
};

/**
 * Finds every match of the joshi (particle) matcher in the text.
 *
 * @param {string} str Text to search.
 * @returns {string[]|boolean} The matches, or false when there are none.
 */
GamMo.prototype.Joshi = function(str) {
  var hits = str.match(this.joshi);
  return hits ? hits : false;
};



