https://github.com/hlk-1135/Dictionary[code]public class SpellChecker { private static final char[] alphabets = "abcdefghijklmnopqrstuvwxyz".toCharArray(); public void start() throws IOException { //1.构建语言模型 String path = "E:\\big.txt"; Map languModel = buildLanguageModel(path); Set dictionary = languModel.keySet(); while((input = reader.readLine()) != null) { input = input.trim().toLowerCase(); if("bye".equals(input)) break; if(dictionary.contains(input)) continue; long startTime = System.currentTimeMillis(); //3.在编辑间隔内设置一个单词集,并删除字典中不存在的单词 Set wordsInEditDistance = buildEditDistance1Set(languModel, input); wordsInEditDistance.retainAll(dictionary); if(wordsInEditDistance.isEmpty()) { wordsInEditDistance = buildEditDistance2Set(languModel, input); wordsInEditDistance.retainAll(dictionary); if (wordsInEditDistance.isEmpty()) { System.out.println("Failed to check this word!"); continue; } } // 4.计算以是可能的概率 List guessWords = guessRightWord(languModel, wordsInEditDistance); System.out.printf("Do you want to input %s and Cost time: %.10f second(s)\n", guessWords.toString(), (System.currentTimeMillis() - startTime) / 1000D); } } /** * 读取语料库big.txt,构建模型 * @param path * @return * @throws IOException */ private Map buildLanguageModel(String path) throws IOException { Map languModel = new HashMap(); BufferedReader reader = new BufferedReader(new FileReader(path)); //去掉文档中除字母外的所有符号 Pattern pattern = Pattern.compile("[a-zA-Z]+"); String line; int totalCount = 0; while ((line = reader.readLine()) != null) { String[] words = line.split(" "); for(String word : words) { if(pattern.matcher(word).matches()) { word = word.toLowerCase(); Double wordCount = languModel.get(word); if(wordCount == null) { languModel.put(word, 1D); } else { languModel.put(word, wordCount+1D); } totalCount++; } } } reader.close(); for(Entry entry : languModel.entrySet()) entry.setValue(entry.getValue() / totalCount); return languModel; } /** * 编辑间隔为1的单词聚集 * @param languModel * @param input * @return */ private Set buildEditDistance1Set(Map languModel,String input) { Set wordsInEditDistance = new HashSet(); char[] characters = input.toCharArray(); // 删除:删除一个字母的情况,delete letter for(int i=0;i
欢迎光临 ToB企服应用市场:ToB评测及商务社交产业平台 (https://dis.qidao123.com/) | Powered by Discuz! X3.4 |