|
发表于 2007-6-20 15:34:17
|
显示全部楼层
這類排列組合,在編製分詞詞庫(字典檔)或機器人語彙檔時也常實際使用(例如將中、英文人名的姓與名批次組合,有一定的準確率)。但是在實際編寫的過程中,才真正體會到中文的特性(不敢說"博大精深")。
每個語文都有其文法與語意結構上的特性,這才是NLP應該注意的重點(例如目前已經商業化的多國語翻譯機)。
形態類似這樣↓
- # Numbers
- # Builds the %cnumbers lookup table: each 2-unit slice of $numbers becomes a
- # hash key whose value is 1.
- # NOTE(review): the stride of 2 and the substr length of 2 presumably assume a
- # double-byte encoding (e.g. Big5) where every character occupies two bytes --
- # confirm against the encoding this file is actually saved in.
- # NOTE(review): the digits and "." appear as single-byte ASCII here; they were
- # presumably full-width in the original, otherwise the 2-unit slices would pair
- # them up -- verify.
- $numbers = "零○一二三四五六七八九十百千萬億0123456789.點第";
- $numbers .= "多半數幾倆卅兩壹貳三肆伍陸柒捌玖拾伯仟";
- for ($n = 0; $n < length($numbers); $n+=2) {
- $cnumbers{substr($numbers, $n, 2)} = 1;
- }
- # Wide ASCII words
- # Builds the %cascii lookup table: each 2-unit slice of $wascii becomes a hash
- # key whose value is 1.
- # NOTE(review): the section title says "wide", but the letters below appear as
- # ordinary single-byte ASCII; with a stride of 2 they would be keyed in pairs
- # ("ab", "cd", ...). Presumably they were full-width characters in the
- # original -- verify.
- $wascii = "abcdefghijklmnopqrstuvwxyz.";
- $wascii .= "ABCDEFGHIJKLMNOPQRSTUVWXYZ-";
- # As written, the next line appends an empty string and has no effect.
- # NOTE(review): its original content may have been lost -- confirm.
- $wascii .= "";
- for ($n = 0; $n < length($wascii); $n+=2) {
- $cascii{substr($wascii, $n, 2)} = 1;
- }
- # Foreign name transliteration characters
- # Builds the %cforeign lookup table: $foreign is assembled from several string
- # pieces, then each 2-unit slice becomes a hash key whose value is 1.
- # NOTE(review): the stride of 2 presumably assumes a double-byte encoding
- # (e.g. Big5), i.e. one slice per Chinese character -- confirm against the
- # file's actual encoding.
- $foreign = "阿埃艾愛安奧澳巴貝本比賓波博伯卜布茨達大戴德登迪蒂丁都多俄厄爾法菲費芬";
- $foreign .= "夫福弗佛蓋岡哥戈格根古哈海合赫胡華霍基吉及加伽賈傑金喀卡凱柯科可克庫拉";
- $foreign .= "萊來賴蘭勞勒雷累黎裡利烈林盧魯倫羅洛馬麥邁曼蒙米摩莫墨姆穆那納乃";
- $foreign .= "內尼努諾帕佩蓬皮匹普奇齊喬切冉薩塞桑森沙捨什史士斯索塔泰坦特圖土托瓦萬";
- $foreign .= "維溫沃烏伍西希謝辛休遜雅亞延耶伊印尤澤扎詹諸茲腓胥";
- for ($n = 0; $n < length($foreign); $n+=2) {
- $cforeign{substr($foreign, $n, 2)} = 1;
- }
- #Chinese surnames
- # Builds two lookup tables from strings of surname characters:
- #   %csurname         -- keyed by each 2-unit slice of $surname
- #   %uncommoncsurname -- keyed by each 2-unit slice of $uncommonsurname
- # Every key is stored with the value 1.
- # NOTE(review): the stride of 2 presumably assumes a double-byte encoding
- # (e.g. Big5), i.e. one slice per surname character -- confirm against the
- # file's actual encoding.
- $surname = "艾安敖白班包寶保鮑貝畢邊卞柏卜蔡曹岑柴昌常陳程遲池褚楚";
- $surname .= "儲淳崔戴刀鄧狄刁丁董竇杜端段樊范方房斐費豐封馮鳳伏福傅蓋甘";
- $surname .= "高戈耿龔宮勾苟辜谷古顧官關管桂郭杭郝禾何賀赫衡洪侯胡花";
- $surname .= "華黃霍稽姬吉紀季賈簡翦姜江蔣焦晉金靳荊居康柯空孔匡鄺況藍";
- $surname .= "郎朗勞樂雷冷黎李理厲利勵廉練良梁廖林凌劉柳隆龍樓婁盧呂魯";
- $surname .= "陸倫羅洛駱麻馬麥滿茅毛梅孟米苗繆閔莫牟穆倪聶鈕農潘龐";
- $surname .= "裴彭皮樸平蒲溥浦戚祁齊錢強喬秦丘邱仇裘屈瞿冉饒任榮容阮";
- $surname .= "瑞芮薩賽沙單商邵佘申沈盛石史壽舒斯宋蘇孫邰譚談湯唐陶滕";
- $surname .= "田佟仝屠塗萬汪王危韋魏衛蔚溫聞翁巫鄔伍武吳奚習夏鮮冼";
- $surname .= "項蕭解謝辛邢幸熊徐許宣薛荀顏閻言嚴彥晏燕楊陽姚葉蟻易殷銀尹";
- $surname .= "游尤於魚虞俞余禹喻郁尉元袁岳雲臧曾查翟詹湛張章招趙甄";
- $surname .= "鄭鐘周諸朱竺祝莊卓宗鄒祖左";
- $uncommonsurname = "車成全韓賴連路明牛權時水文席應英於"; # 和 / 同 -- NOTE(review): presumably two further surname characters deliberately left out of the string; confirm
- for ($n = 0; $n < length($surname); $n+=2) {
- $csurname{substr($surname, $n, 2)} = 1;
- }
- for ($n = 0; $n < length($uncommonsurname); $n+=2) {
- $uncommoncsurname{substr($uncommonsurname, $n, 2)} = 1;
- }
- # Add in 2 character surnames; also add to lexicon so they'll be segmented as one unit
- # Each line below stores the same two-character surname as a key (value 1) in
- # both %csurname and %cwords.
- $csurname{"東郭"} = 1; $cwords{"東郭"} = 1;
- $csurname{"公孫"} = 1; $cwords{"公孫"} = 1;
- $csurname{"皇甫"} = 1; $cwords{"皇甫"} = 1;
- $csurname{"慕容"} = 1; $cwords{"慕容"} = 1;
- $csurname{"歐陽"} = 1; $cwords{"歐陽"} = 1;
- $csurname{"單于"} = 1; $cwords{"單于"} = 1;
- $csurname{"司空"} = 1; $cwords{"司空"} = 1;
- $csurname{"司馬"} = 1; $cwords{"司馬"} = 1;
- $csurname{"司徒"} = 1; $cwords{"司徒"} = 1;
- $csurname{"澹台"} = 1; $cwords{"澹台"} = 1;
- $csurname{"諸葛"} = 1; $cwords{"諸葛"} = 1;
- # Appends a set of punctuation marks to $punctuation; any earlier value of
- # $punctuation is not visible in this excerpt.
- # NOTE(review): several marks (":", ",", "(", ")", "?", "!") appear as
- # single-byte ASCII here; they were presumably full-width in the original,
- # which matters for the 2-unit slicing used elsewhere in this script -- verify.
- $punctuation .= "、:,。★【】()⊙~【】「」—‧?!「」 ";
- #Not in name
- # $notname starts as a string of characters listed as "not in name" and then
- # has the whole $punctuation string appended to it.
- $notname = "的說對在和是被最所那這有將會與於他為鎮";
- $notname .= $punctuation;
- for ($n = 0; $n < length($notname); $n+=2) {
- $cnotname{substr($notname, $n, 2)} = 1;
复制代码
[ 本帖最后由 Artvine 于 2007-6-20 15:44 编辑 ] |
|