查找多个词典中的公共词并生成公共词典

【场景】:现在有4个词典:
1. wordmap.txt:训练语料的词典(由LDA-工具生成,这倒是一个很偷懒的方法);
2. CWembeddings-scaled.EMBEDDING_SIZE=50.txt,由Turian-ACL2010提供的50维词向量;
3. HLBL-embeddings-scaled.EMBEDDING_SIZE=50.txt,由Turian-ACL2010提供的50维词向量;
4. Senna-embeddings-words.lst(词表)与 Senna-embeddings.txt(对应的向量,两个文件逐行对应),由Senna系统提供的50维词向量;

【需求】:
1. 找出这些词典中的公共词,生成公共词典 comWord.dic;
2. 按照词典顺序输出对应的3个词向量,分别保存到各自的文件中:Senna_embeddings.vec、CW_embeddings.vec、HLBL_embeddings.vec;
3. 希望把有词向量的放到前面,没有词向量的放到后面。训练数据中的所有词都做保留。
【代码如下】:

  import java.io.BufferedReader;
  import java.io.BufferedWriter;
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.FileOutputStream;
  import java.io.IOException;
  import java.io.InputStreamReader;
  import java.io.OutputStreamWriter;
  import java.util.ArrayList;
  import java.util.HashMap;
  import java.util.LinkedHashMap;
  import java.util.Map;
  import java.util.Map.Entry;

  /**
   * Intersects the training vocabulary with the vocabularies of the enabled
   * pre-trained word embeddings (Senna, CW, HLBL).
   *
   * <p>Inputs (paths are relative to the working directory):
   * <ul>
   *   <li>{@code ./SearchSnippets/wordmap.txt} - training vocabulary; the first
   *       whitespace-separated token of every line is taken as the word.</li>
   *   <li>{@code ./WordEmbedding/Senna-embeddings-words.lst} and
   *       {@code ./WordEmbedding/Senna-embeddings.txt} - Senna word list and vectors,
   *       read in lock-step (line i of one belongs to line i of the other).</li>
   *   <li>{@code ./WordEmbedding/CWembeddings-scaled.EMBEDDING_SIZE=50.txt} and
   *       {@code ./WordEmbedding/HLBL-embeddings-scaled.EMBEDDING_SIZE=50.txt} -
   *       one "word v1 v2 ..." entry per line.</li>
   * </ul>
   *
   * <p>Outputs: {@code comWord.dic} (words that have a vector in every enabled
   * embedding), one {@code *_embeddings.vec} file per enabled embedding whose
   * line i is the vector of line i of {@code comWord.dic}, and
   * {@code obscureWord.lst} (training words missing from at least one enabled
   * embedding). Words are emitted in the order they appear in the word map.
   */
  public class FindComWords {

      /** Character encoding used for every file that is read or written. */
      private static final String ENCODING = "UTF-8";

      /**
       * Runs the whole pipeline: load the training vocabulary, attach the vectors of
       * each enabled embedding, then write the common dictionary, the aligned vector
       * files and the list of words without full coverage.
       *
       * @param args ignored
       * @throws IOException if any input cannot be read or any output cannot be written
       * @throws IllegalStateException if an embedding lists the same dictionary word
       *         twice, or the Senna vector file is shorter than its word list
       */
      public static void main(String[] args) throws IOException {
          boolean useSennaEmbedding = false;
          boolean useCWEmbedding = false;
          boolean useHLBLEmbedding = true;

          // Every enabled embedding gets a fixed slot in each word's vector array, so a
          // word missing from one embedding can never shift the vectors of the others.
          int usedVectorNum = 0;
          int sennaIdx = useSennaEmbedding ? usedVectorNum++ : -1;
          int cwIdx = useCWEmbedding ? usedVectorNum++ : -1;
          int hlblIdx = useHLBLEmbedding ? usedVectorNum++ : -1;

          // LinkedHashMap keeps the word-map order, which makes the output deterministic.
          Map<String, String[]> dicMap = new LinkedHashMap<String, String[]>();

          System.out.println("Start read wordmapFile!");
          try (BufferedReader wordmapFile = openReader("./SearchSnippets/wordmap.txt")) {
              String line;
              while ((line = wordmapFile.readLine()) != null) {
                  String trimmed = line.trim();
                  if (trimmed.isEmpty()) {
                      continue; // a blank line carries no word
                  }
                  dicMap.put(trimmed.split("\\s+")[0], new String[usedVectorNum]);
              }
          }

          if (useSennaEmbedding) {
              System.out.println("Start read Senna_embeddings_wordsFile!");
              try (BufferedReader words = openReader("./WordEmbedding/Senna-embeddings-words.lst");
                   BufferedReader vectors = openReader("./WordEmbedding/Senna-embeddings.txt")) {
                  String wordLine;
                  while ((wordLine = words.readLine()) != null) {
                      // The vector file is consumed in lock-step with the word list.
                      String vector = vectors.readLine();
                      if (vector == null) {
                          throw new IllegalStateException(
                                  "Senna vector file ended before its word list at:" + wordLine);
                      }
                      storeVector(dicMap, wordLine.trim(), vector, sennaIdx,
                              "Senna_embeddings_wordsFile", wordLine);
                  }
              }
          }
          if (useCWEmbedding) {
              System.out.println("Start read CWembeddingsFile!");
              readInlineEmbedding("./WordEmbedding/CWembeddings-scaled.EMBEDDING_SIZE=50.txt",
                      "CWembeddingsFile", dicMap, cwIdx);
          }
          if (useHLBLEmbedding) {
              System.out.println("Start read HLBLembeddingsFile!");
              readInlineEmbedding("./WordEmbedding/HLBL-embeddings-scaled.EMBEDDING_SIZE=50.txt",
                      "HLBLembeddingsFile", dicMap, hlblIdx);
          }

          System.out.println("Start to generate dictionary, index and wordEmbedding!");
          int validCount = 0;
          // A null resource is legal in try-with-resources and is simply skipped on close,
          // so writers of disabled embeddings are never created.
          try (BufferedWriter comWordDicFile = openWriter("comWord.dic");
               BufferedWriter obscureWordFileW = openWriter("obscureWord.lst");
               BufferedWriter sennaOut = useSennaEmbedding ? openWriter("Senna_embeddings.vec") : null;
               BufferedWriter cwOut = useCWEmbedding ? openWriter("CW_embeddings.vec") : null;
               BufferedWriter hlblOut = useHLBLEmbedding ? openWriter("HLBL_embeddings.vec") : null) {
              for (Entry<String, String[]> entry : dicMap.entrySet()) {
                  String word = entry.getKey();
                  String[] vectors = entry.getValue();
                  if (!hasAllVectors(vectors)) {
                      obscureWordFileW.write(word + "\n");
                      continue;
                  }
                  comWordDicFile.write(word + "\n");
                  validCount++;
                  if (sennaOut != null) {
                      sennaOut.write(vectors[sennaIdx] + "\n");
                  }
                  if (cwOut != null) {
                      cwOut.write(vectors[cwIdx] + "\n");
                  }
                  if (hlblOut != null) {
                      hlblOut.write(vectors[hlblIdx] + "\n");
                  }
              }
          }
          System.out.println("Info: the size of valid word list is:" + validCount);
          System.out.println("Info: the size of invalid word list is:"
                  + (dicMap.size() - validCount));
      }

      /** Opens {@code path} for buffered reading in {@link #ENCODING}. */
      private static BufferedReader openReader(String path) throws IOException {
          return new BufferedReader(new InputStreamReader(
                  new FileInputStream(new File(path)), ENCODING));
      }

      /** Opens (creating or truncating) {@code path} for buffered writing in {@link #ENCODING}. */
      private static BufferedWriter openWriter(String path) throws IOException {
          return new BufferedWriter(new OutputStreamWriter(
                  new FileOutputStream(new File(path)), ENCODING));
      }

      /**
       * Reads an embedding file whose lines look like "word v1 v2 ..." and stores the
       * vector part of every dictionary word into {@code slot}. The vector is the rest
       * of the line after the word, kept verbatim (including its leading separator).
       */
      private static void readInlineEmbedding(String path, String label,
              Map<String, String[]> dicMap, int slot) throws IOException {
          try (BufferedReader in = openReader(path)) {
              String line;
              while ((line = in.readLine()) != null) {
                  if (line.trim().isEmpty()) {
                      continue; // split() would yield an empty array for whitespace-only lines
                  }
                  String word = line.split("\\s+")[0];
                  storeVector(dicMap, word.trim(), line.substring(word.length()), slot, label, line);
              }
          }
      }

      /**
       * Records {@code vector} for {@code word} in the given slot. Words that are not
       * part of the training dictionary are ignored.
       *
       * @throws IllegalStateException if the slot is already filled, i.e. the embedding
       *         identified by {@code label} contains the word more than once
       */
      private static void storeVector(Map<String, String[]> dicMap, String word, String vector,
              int slot, String label, String line) {
          String[] vectors = dicMap.get(word);
          if (vectors == null) {
              return;
          }
          if (vectors[slot] != null) {
              throw new IllegalStateException("Error in " + label + ":" + line);
          }
          vectors[slot] = vector;
      }

      /** Returns true when every enabled embedding supplied a vector for the word. */
      private static boolean hasAllVectors(String[] vectors) {
          for (String vector : vectors) {
              if (vector == null) {
                  return false;
              }
          }
          return true;
      }
  }

发表评论

电子邮件地址不会被公开。 必填项已用 * 标注

*

您可以使用这些 HTML 标签和属性: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>