Preparing the Dataset for DCNN (ACL 2014)

Data format expected by DCNN-2014:

| Variable | Size | Description | Status |
| --- | --- | --- | --- |
| index | 53714 words (comWord.dic) | string dictionary; try to ensure every word occurs in the training set, the test set, and the word-embedding files | OK |
| sent_length | 40 (actual maximum length is 38) | maximum text length | OK |
| size_vocab | 53714 + 1 | dictionary size plus one blank (padding) word | OK |
| test | 2280 × sent_length | test data | OK |
| test_lbl | {64-bit hash label (optional)}{text length} | produced by Spectral training; the 64-bit hash label may be omitted | — |
| train | 10060 × sent_length | training data | OK |
| train_lbl | {64-bit hash label}{text length} | requires the 64 hash-bit labels; produced by Spectral training | — |
| vocab_emb_1_CW | 50 × vocabulary size | externally obtained word embeddings; too few words covered | deferred |
| vocab_emb_2_HLBL | 50 × vocabulary size | even if stacked, too few words covered | deferred |
| vocab_emb_3_Senna | 50 × vocabulary size | externally obtained word embeddings; consider whether to divide by std(A) | OK |

Format: 【SearchSnippets dataset description】: [download link], [mirror download link]
Training and test data are Google search snippets.
Training data: — File: train.txt — Size: 10,060 snippets
Test data: — File: test.txt — Size: 2,280 snippets

One snippet per line
Each snippet consists of a list of words/terms plus a class label at the end

8 domains (class labels):
Business, Computers, Culture-Arts-Entertainment, Education-Science, Engineering, Health, Politics-Society, Sports

【Step 1. Build the dictionary】: A convenient trick is to paste the contents of train.txt and test.txt into a single txt file and run LDA over it, which automatically generates a wordmap.txt file. [LDA tool link], [alternate LDA tool]
LDA invocation: java -jar jgibblda.jar -est -alpha 0.5 -beta 0.1 -ntopics 400 -dir ./ -dfile train.txt (see http://jacoxu.com/?p=648 for details)
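(If memory serves, JGibbLDA's wordmap.txt starts with the vocabulary size on its first line, followed by one word and its integer id per line; verify this against your JGibbLDA version before reusing it as the dictionary.)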
However, the SearchSnippets data was scraped from Google result pages and contains a lot of noise, so a better dictionary can be built from related Wikipedia text. The SearchSnippets authors provide [the matching Wiki corpus], [alternate download link].

Still, writing your own dictionary-extraction program tends to be more convenient in practice. Pay close attention to encodings here; stick to UTF-8 with Unix line endings. (Note: line 464 of test.txt ends in garbled bytes, "windows vista鈩?computers". One option is to remove the garbage by hand, turning it into "windows vista computers"; the other is to leave it alone and handle it when assigning labels. It occurs only once, so the impact is negligible; for simplicity and consistency later on, we take the second option and keep it as-is.)
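For completeness, if you prefer the first option instead, a minimal sketch along these lines would do; the class and method names are hypothetical and this is not part of the pipeline above:

    // Hypothetical helper for option 1 (not used in the pipeline below):
    // strip non-ASCII garbage such as the "鈩" bytes in line 464 of test.txt.
    public class CleanGarbledLine {
        static String cleanLine(String line) {
            // SearchSnippets is English, so keep printable ASCII only and
            // collapse the whitespace left behind. Stray punctuation that
            // survives (e.g. the "?" after the garbage) may need extra handling.
            return line.replaceAll("[^\\x20-\\x7E]", " ").trim().replaceAll("\\s+", " ");
        }

        public static void main(String[] args) {
            System.out.println(cleanLine("windows vista鈩?computers"));
        }
    }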

    package Process4DCNN;

    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.util.HashSet;
    import java.util.Iterator;

    public class Step1_GenerateDict4SearchSnippets {
        public static void main(String[] args) throws Exception {
            // Read the SearchSnippets dataset
            String searchSnippetPathStr = "./Data/SearchSnippets/all-test+train.txt";
            String comWordWithIdxPathStr = "./Data/SearchSnippets/comWordWithIdx_train.dic";

            BufferedReader corpusFile = new BufferedReader(new InputStreamReader(
                    new FileInputStream(new File(searchSnippetPathStr)), "UTF-8"));
            BufferedWriter wordMapFileW = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(new File(comWordWithIdxPathStr)), "UTF-8"));
            String tempLine;
            // Start reading the data
            int wordReadNum = 0;
            int wordWriteNum = 0;
            HashSet<String> wordSet = new HashSet<String>();
            System.out.println("Start to read wordSet ...");
            while ((tempLine = corpusFile.readLine()) != null) {
                String[] wordArraysStr = tempLine.split("\\s+");
                // The last token on each line is the class label, so skip it
                for (int i = 0; i < wordArraysStr.length - 1; i++) {
                    String tmpWord = wordArraysStr[i].trim();
                    if (wordSet.contains(tmpWord)) continue;
                    else {
                        wordSet.add(tmpWord);
                        wordReadNum++;
                    }
                }
            }
            System.out.println("Start to output wordSet ...");
            Iterator<String> iterator = wordSet.iterator();
            while (iterator.hasNext()) {
                wordWriteNum++;
                wordMapFileW.write(iterator.next() + "\t" + wordWriteNum + "\n");
            }
            System.out.println("wordReadNum:" + wordReadNum);
            System.out.println("wordWriteNum:" + wordWriteNum);
            if (wordWriteNum != wordReadNum) {
                System.out.println("Error! wordWriteNum is different from wordReadNum");
            }
            corpusFile.close();
            wordMapFileW.close();
        }
    }
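One caveat: HashSet iteration order is unspecified, so the word-to-index assignment can differ between runs. If a reproducible dictionary matters (e.g., to keep comWord.dic stable across the later steps), a sorted variant is a one-line change in Step1_GenerateDict4SearchSnippets; this is a sketch, not what was used above:

    // Deterministic alternative: a TreeSet iterates in sorted order, so the
    // same corpus always yields the same word-to-index assignment.
    java.util.Set<String> wordSet = new java.util.TreeSet<String>();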

【Step 2. Filter the text against the dictionary】
[Input]: dictionary comWord.dic; input text train.txt
[Output]: the filtered text train_refined.txt; for lines left empty after filtering, record the line numbers, together with the maximum post-filter text length maxLen, in train_refined_Info.lst. (The code below actually runs on the combined all-test+train.txt, producing all_refined.txt and all_refined_Info.lst.) The code:

    package Process4DCNN;

    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.util.HashSet;

    public class Step2_RefineDocs4SearchSnippets {

        public static void main(String[] args) throws IOException {
            // Read the dictionary and the text to be filtered
            String comWordWithIdxPathStr = "./Data/SearchSnippets/comWordWithIdx_train.dic";
            String searchSnippetPathStr = "./Data/SearchSnippets/all-test+train.txt";
            BufferedReader comWordFile = new BufferedReader(new InputStreamReader(
                    new FileInputStream(new File(comWordWithIdxPathStr)), "UTF-8"));
            BufferedReader srcRawFile = new BufferedReader(new InputStreamReader(
                    new FileInputStream(new File(searchSnippetPathStr)), "UTF-8"));
            HashSet<String> dicSet = new HashSet<String>();
            // Output the filtered text, plus the numbers of lines left empty after filtering
            String allRefinedFileStr = "./Data/SearchSnippets/all_refined.txt";
            String invalidDocFileWStr = "./Data/SearchSnippets/all_refined_Info.lst";
            BufferedWriter trainRefinedFile = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(new File(allRefinedFileStr)), "UTF-8"));
            BufferedWriter invalidDocFileW = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(new File(invalidDocFileWStr)), "UTF-8"));
            // Load the dictionary
            String tempLineStr;
            System.out.println("Start to load the dictionary!");
            while ((tempLineStr = comWordFile.readLine()) != null) {
                String[] tmpWords = tempLineStr.split("\t");
                dicSet.add(tmpWords[0].trim());
            }
            // Filter the text against the dictionary, tracking the maximum length maxLen
            System.out.println("Start to refine the raw documents!");
            int maxLen = 0;
            int lineNum = 1;
            while ((tempLineStr = srcRawFile.readLine()) != null) {
                int tmpLen = 0;
                String[] tmpWords = tempLineStr.split("\\s+");
                // The last token of each SearchSnippets line is the class label, so drop it
                for (int i = 0; i < tmpWords.length - 1; i++) {
                    String tmpWord = tmpWords[i];
                    // Keep the word if it is in the dictionary, drop it otherwise
                    if (dicSet.contains(tmpWord.trim())) {
                        trainRefinedFile.write(tmpWord.trim() + " ");
                        tmpLen++;
                    } else {
                        System.out.println("Error in line:" + lineNum + ", and the word is:" + tmpWord.trim());
                    }
                }
                if (tmpLen == 0) {
                    // Nothing survived the filter: record the line number
                    invalidDocFileW.write(lineNum + "\n");
                    trainRefinedFile.write(" \n"); // keep a blank line to preserve alignment
                } else {
                    if (tmpLen > maxLen) maxLen = tmpLen;
                    trainRefinedFile.write("\n");
                }
                lineNum++;
            }
            System.out.println("Info: maxLen is " + maxLen);
            invalidDocFileW.write("Info: maxLen is " + maxLen);
            comWordFile.close();
            srcRawFile.close();
            trainRefinedFile.close();
            invalidDocFileW.close();
        }
    }

【Step 3. Generate the label file for the dataset】:
Input: the raw text. Output: the class index for each line of text.

    package Process4DCNN;

    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.util.HashMap;

    public class Step3_CreateIndx4SearchSnippets {

        public static void main(String[] args) throws IOException {
            HashMap<String, Integer> tagMap = new HashMap<String, Integer>();
            String searchSnippetPathStr = "./Data/SearchSnippets/all-test+train.txt";
            BufferedReader srcRawFile = new BufferedReader(new InputStreamReader(
                    new FileInputStream(new File(searchSnippetPathStr)), "UTF-8"));
            // Output one class index per input line
            String allIndxFileStr = "./Data/SearchSnippets/all_indx.txt";
            BufferedWriter trainIndxFile = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(new File(allIndxFileStr)), "UTF-8"));
            tagMap.put("business", 1);
            tagMap.put("computers", 2);
            tagMap.put("culture-arts-entertainment", 3);
            tagMap.put("education-science", 4);
            tagMap.put("engineering", 5);
            tagMap.put("health", 6);
            tagMap.put("politics-society", 7);
            tagMap.put("sports", 8);
            String tempLineStr;
            System.out.println("Start to index the labels!");
            int lineNum = 1;
            while ((tempLineStr = srcRawFile.readLine()) != null) {
                String[] tmpWords = tempLineStr.split("\\s+");
                // The last token of each SearchSnippets line is the label text
                String tag = tmpWords[tmpWords.length - 1].trim();
                if (tagMap.containsKey(tag)) {
                    trainIndxFile.write(tagMap.get(tag) + "\n");
                } else {
                    System.out.println("Error in line:" + lineNum + ", and the tag is:" + tag);
                    // Special case: fall back to the "computers" label
                    trainIndxFile.write(tagMap.get("computers") + "\n");
                }
                lineNum++;
            }
            System.out.println("OK!");
            trainIndxFile.close();
            srcRawFile.close();
        }
    }
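Note that the else branch is precisely the special case from Step 1: line 464 of test.txt, whose trailing label was destroyed by the garbled bytes, is hard-labeled as computers here.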

【Step 4. Build the vector space model (VSM)】:
Input: the label-stripped text and the dictionary. Output: the VSM.

    package Process4DCNN;

    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.util.ArrayList;
    import java.util.HashMap;

    public class Step4_Process4STHFormat4SearchSnippets {
        public static void main(String[] args) throws Exception {
            // Build a term-frequency vector space model from the plain text and the
            // word map, as preprocessing for STH. Note: all = [test_data; train_data]!
            String allRefinedFileStr = "./Data/SearchSnippets/all_refined.txt";
            String comWordWithIdxPathStr = "./Data/SearchSnippets/comWordWithIdx_train.dic";
            String vsmTextStr = "./Data/SearchSnippets/vsmOfall.dic";
            BufferedReader sourceTextRD = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(allRefinedFileStr)), "UTF-8"));
            BufferedReader wordMapRD = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(comWordWithIdxPathStr)), "UTF-8"));
            BufferedWriter vsmBW = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(new File(vsmTextStr)), "UTF-8"));
            // Build the term-frequency VSM
            createVSMText(sourceTextRD, wordMapRD, vsmBW);

            sourceTextRD.close();
            wordMapRD.close();
            vsmBW.close();
            System.out.println("It is done, ok!");
        }

        public static void createVSMText(BufferedReader sourceTextRD,
                BufferedReader wordMapRD, BufferedWriter vsmBW) throws IOException, Exception {
            System.out.println("Start to create VSM ...!");
            String tempLine;
            // First load the dictionary
            int wordIdxNum = 0;
            HashMap<String, Integer> wordMap = new HashMap<String, Integer>();
            while ((tempLine = wordMapRD.readLine()) != null) {
                if (tempLine.trim().length() == 0) continue;
                // Each dictionary line holds a word and its index; indices start at 1
                String word = tempLine.split("\\s+")[0].trim();
                if (wordMap.containsKey(word)) {
                    System.out.println("Test, the word is replicated:" + word);
                }
                wordMap.put(word, Integer.valueOf(tempLine.split("\\s+")[1]));
                wordIdxNum++;
            }
            // The feature dimensionality; note that it is dataset-specific
            int dimVector = wordIdxNum;
            System.out.println("Has read the dictionary, the size is:" + wordMap.size());
            ArrayList<Integer> wordFreqList = new ArrayList<Integer>();
            int lineNum = 1;
            boolean hasWordFeature = false;
            while ((tempLine = sourceTextRD.readLine()) != null) {
                // When the corpus is large the VSM gets huge; if needed, process it
                // in batches by bounding lineNum here, e.g.:
                // if (!(150001 <= lineNum)) { lineNum++; continue; } // lower bound
                // if (!(lineNum <= 155000)) break;                   // upper bound
                hasWordFeature = false;
                // One line is one document
                wordFreqList.clear();
                for (int i = 0; i < dimVector; i++) {
                    wordFreqList.add(0);
                }
                String[] tokensStr = tempLine.split("\\s+");
                for (String tempToken : tokensStr) {
                    if (wordMap.containsKey(tempToken.trim())) {
                        hasWordFeature = true;
                        int index = wordMap.get(tempToken.trim());
                        if (index > dimVector) {
                            System.out.print("Error, and the word is: " + tempToken.trim());
                        }
                        wordFreqList.set(index - 1, wordFreqList.get(index - 1) + 1);
                    } else {
                        System.out.println("error: the map does not contain the word:"
                                + tempToken + " in Line:" + lineNum);
                    }
                }
                for (int tempFreq : wordFreqList) {
                    vsmBW.write(String.valueOf(tempFreq) + " ");
                }
                vsmBW.write("\n");
                if (!hasWordFeature) {
                    System.out.println("++++++++++" + "has no word in Line:" + lineNum + "++++++++++");
                }
                lineNum++;
            }
        }
    }
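The resulting vsmOfall.dic is a dense document-term count matrix, one row per snippet and one column per dictionary word; it is the input from which the Spectral (STH) step produces the 64-bit hash labels for train_lbl and test_lbl listed in the format table above.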

【Step 5. Generate DCNN-format training and test corpora】:
Input: training text train_refined.txt, test text test_refined.txt, the common dictionary comWord.dic, and a fixed maximum length of 40.
Output: each word replaced by its dictionary index, with lines shorter than the maximum padded with size(comWord)+1. This produces train_DCNN.txt and test_DCNN.txt, and writes the corresponding text lengths to train_lbl.txt and test_lbl.txt:

    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.util.HashMap;

    public class Transform2DCNNFormat {

        public static void main(String[] args) throws IOException {
            // Read the dictionary and the corpus text
            BufferedReader comWordFile = new BufferedReader(new InputStreamReader(
                    new FileInputStream(new File("./SearchSnippets/comWord.dic")), "UTF-8"));
            BufferedReader srcRawFile = new BufferedReader(new InputStreamReader(
                    new FileInputStream(new File("./SearchSnippets/test_refined.txt")), "UTF-8"));
            HashMap<String, Integer> dicMap = new HashMap<String, Integer>();

            // Output the DCNN-format corpus and the text lengths
            BufferedWriter trainDCNNFile = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(new File("test_DCNN.txt")), "UTF-8"));
            BufferedWriter trainlblFileW = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(new File("test_lbl.txt")), "UTF-8"));
            int maxLen = 40;

            // Load the dictionary
            String tempLineStr;
            System.out.println("Start to load the dictionary!");
            while ((tempLineStr = comWordFile.readLine()) != null) {
                String[] tmpWords = tempLineStr.split("\\s+");
                dicMap.put(tmpWords[0].trim(), Integer.valueOf(tmpWords[1]));
            }
            // Transform the text via the dictionary, recording each text length
            System.out.println("Start to transform the raw documents!");
            int lineNum = 1;
            while ((tempLineStr = srcRawFile.readLine()) != null) {
                int tmpLen = 0;
                String[] tmpWords = tempLineStr.split("\\s+");
                for (int i = 0; i < tmpWords.length; i++) {
                    if (dicMap.containsKey(tmpWords[i].trim())) {
                        trainDCNNFile.write(dicMap.get(tmpWords[i].trim()) + " ");
                        // Count only words actually written, so the padding
                        // below fills the line to exactly maxLen entries
                        tmpLen++;
                    } else {
                        System.out.println("Error in line: " + lineNum);
                    }
                }
                if (tmpLen >= maxLen) System.out.println("Error in line: " + lineNum);
                // Pad with the index size(comWord)+1, which denotes the blank word
                for (int i = tmpLen; i < maxLen; i++) {
                    trainDCNNFile.write((dicMap.size() + 1) + " ");
                }
                trainDCNNFile.write("\n");
                trainlblFileW.write(tmpLen + "\n");
                lineNum++;
            }

            comWordFile.close();
            srcRawFile.close();
            trainDCNNFile.close();
            trainlblFileW.close();
            System.out.println("It's done, OK!");
        }
    }
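To make the output format concrete: with the 53714-word dictionary, a three-word snippet whose words map to (hypothetical) indices 12, 7 and 9038 becomes the line "12 7 9038" followed by 37 copies of the padding index 53715, i.e. exactly 40 entries, while "3" goes to the length file. The quick check below is a sketch; it assumes the test_DCNN.txt produced above sits in the working directory, and the class name CheckDCNNFormat is made up:

    // Verify that every line of test_DCNN.txt holds exactly maxLen (= 40) indices.
    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.InputStreamReader;

    public class CheckDCNNFormat {
        public static void main(String[] args) throws Exception {
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    new FileInputStream("test_DCNN.txt"), "UTF-8"));
            String line;
            int lineNum = 1;
            while ((line = in.readLine()) != null) {
                int n = line.trim().split("\\s+").length;
                if (n != 40)
                    System.out.println("Bad length " + n + " at line " + lineNum);
                lineNum++;
            }
            in.close();
            System.out.println("Check finished.");
        }
    }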

【Step 6】: Fetch the corresponding word embeddings.

    package Process4DCNN;

    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.util.ArrayList;
    import java.util.HashMap;

    public class Step6_FetchWord2Vec {
        public static void main(String[] args) throws IOException {
            String datasetStr = "SearchSnippets";
            if (args.length == 1) {
                datasetStr = args[0];
            }
            String comWordWithIdxPathStr = "./Data/SearchSnippets/comWordWithIdx_train.dic";
            String word2Vec300FileStr = "./Data/GoogleNews-vectors-negative300.txt";
            String word2Vec300FileWStr = "./Data/SearchSnippets/vocab_emb_Word2vec_GoNew300.vec";
            String comWordDicindexFileWStr = "./Data/SearchSnippets/vocab_emb_Word2vec_GoNew300_index.dic";

            System.out.println("Dataset:" + datasetStr);
            boolean useSennaEmbedding = true;
            boolean useCWEmbedding = false;
            boolean useHLBLEmbedding = false;
            boolean useWord2Vector300 = false;
            int usedVectorNum = 0;
            // Read the training dictionary and the embedding files
            BufferedReader wordmapFile = new BufferedReader(new InputStreamReader(
                    new FileInputStream(new File(comWordWithIdxPathStr)), "UTF-8"));
            BufferedReader Senna_embeddings_wordsFile = null;
            BufferedReader Senna_embeddings_vectorFile = null;
            BufferedWriter Senna_embeddings_wordsFileW = null;
            if (useSennaEmbedding) {
                Senna_embeddings_wordsFile = new BufferedReader(new InputStreamReader(
                        new FileInputStream(new File("./WordEmbedding/Senna-embeddings-words.lst")), "UTF-8"));
                Senna_embeddings_vectorFile = new BufferedReader(new InputStreamReader(
                        new FileInputStream(new File("./WordEmbedding/Senna-embeddings.txt")), "UTF-8"));
                Senna_embeddings_wordsFileW = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(new File("Senna_embeddings_train.vec")), "UTF-8"));
            }
            BufferedReader CWembeddingsFile = null;
            BufferedWriter CWembeddingsFileW = null;
            if (useCWEmbedding) {
                CWembeddingsFile = new BufferedReader(new InputStreamReader(
                        new FileInputStream(new File("./WordEmbedding/CWembeddings-scaled.EMBEDDING_SIZE=50.txt")), "UTF-8"));
                CWembeddingsFileW = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(new File("CW_embeddings.vec")), "UTF-8"));
            }
            BufferedReader HLBLembeddingsFile = null;
            BufferedWriter HLBLembeddingsFileW = null;
            if (useHLBLEmbedding) {
                HLBLembeddingsFile = new BufferedReader(new InputStreamReader(
                        new FileInputStream(new File("./WordEmbedding/HLBL-embeddings-scaled.EMBEDDING_SIZE=50.txt")), "UTF-8"));
                HLBLembeddingsFileW = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(new File("HLBL_embeddings.vec")), "UTF-8"));
            }
            BufferedReader word2Vec300File = null;
            BufferedWriter word2Vec300FileW = null;
            if (useWord2Vector300) {
                word2Vec300File = new BufferedReader(new InputStreamReader(
                        new FileInputStream(new File(word2Vec300FileStr)), "UTF-8"));
                word2Vec300FileW = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(new File(word2Vec300FileWStr)), "UTF-8"));
            }
            // Write the common dictionary index
            BufferedWriter comWordDicindexFile = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(new File(comWordDicindexFileWStr)), "UTF-8"));

            HashMap<Integer, ArrayList<String>> vecListMap = new HashMap<Integer, ArrayList<String>>();
            HashMap<String, Integer> dicIdxMap = new HashMap<String, Integer>();
            String tempLineStr;
            // First load the training dictionary
            System.out.println("Start read wordmapFile!");
            while ((tempLineStr = wordmapFile.readLine()) != null) {
                String[] termDict = tempLineStr.split("\\s+");
                dicIdxMap.put(termDict[0].trim(), Integer.valueOf(termDict[1].trim())); // word -> index
                ArrayList<String> tmpVectorList = new ArrayList<String>();
                vecListMap.put(Integer.valueOf(termDict[1].trim()), tmpVectorList); // index -> vectorList
            }
            String tmpWordVector;
            int SennaIdx = 0;
            int CWIdx = 0;
            int HLBLIdx = 0;
            int Word2vec300Idx = 0;
            int wordIndex = 0;
            if (useSennaEmbedding) {
                // Match the dictionary against the Senna vocabulary; the words
                // list and the vector file are aligned line by line
                System.out.println("Start read Senna_embeddings_wordsFile!");
                while ((tempLineStr = Senna_embeddings_wordsFile.readLine()) != null) {
                    tmpWordVector = Senna_embeddings_vectorFile.readLine();
                    if (!dicIdxMap.containsKey(tempLineStr.trim())) continue;
                    ArrayList<String> tmpVectorList = vecListMap.get(dicIdxMap.get(tempLineStr.trim()));
                    if (tmpVectorList.size() > usedVectorNum) {
                        System.out.println("Error in Senna_embeddings_wordsFile:" + tempLineStr);
                        System.exit(0);
                    }
                    tmpVectorList.add(tmpWordVector);
                    vecListMap.put(dicIdxMap.get(tempLineStr.trim()), tmpVectorList);
                }
                SennaIdx = usedVectorNum++;
            }
            // The CW and HLBL branches mirror the word2vec branch below (their files
            // put the word and its vector on the same line); they are omitted here
            // since both flags are false.
            if (useWord2Vector300) {
                // Match the dictionary against the word2vec vocabulary
                System.out.println("Start read Word2Vector300File!");
                while ((tempLineStr = word2Vec300File.readLine()) != null) {
                    // If the converted word2vec text file has header lines, skip them here
                    String[] tmpWords = tempLineStr.split("\\s+");
                    tmpWordVector = tempLineStr.substring(tmpWords[0].length());
                    if (!dicIdxMap.containsKey(tmpWords[0].trim())) continue;
                    ArrayList<String> tmpVectorList = vecListMap.get(dicIdxMap.get(tmpWords[0].trim()));
                    if (tmpVectorList.size() > usedVectorNum) {
                        System.out.println("Error in Word2Vec300File:" + tempLineStr);
                        System.exit(0);
                    }
                    tmpVectorList.add(tmpWordVector);
                    vecListMap.put(dicIdxMap.get(tmpWords[0].trim()), tmpVectorList);
                    wordIndex++;
                }
                Word2vec300Idx = usedVectorNum++;
            }
            // Output the dictionary indices and the matched word vectors
            System.out.println("Start to generate dictionary, index and wordEmbedding!");
            int obscIndex = wordIndex + 1;
            wordIndex = 1;
            for (int i = 1; i <= dicIdxMap.size(); i++) {
                ArrayList<String> wordVectorList = vecListMap.get(i);
                if (wordVectorList.size() != usedVectorNum) {
                    // The word is missing from at least one enabled embedding
                    obscIndex++;
                    continue;
                }
                comWordDicindexFile.write(i + "\n");
                wordIndex++;
                if (useSennaEmbedding)
                    Senna_embeddings_wordsFileW.write(wordVectorList.get(SennaIdx) + "\n");
                if (useCWEmbedding)
                    CWembeddingsFileW.write(wordVectorList.get(CWIdx) + "\n");
                if (useHLBLEmbedding)
                    HLBLembeddingsFileW.write(wordVectorList.get(HLBLIdx) + "\n");
                if (useWord2Vector300)
                    word2Vec300FileW.write(wordVectorList.get(Word2vec300Idx) + "\n");
            }

            System.out.println("Info: the size of valid word list is:" + (wordIndex - 1));
            System.out.println("Info: the size of invalid word list is:" + (obscIndex - 1));
            wordmapFile.close();
            if (useSennaEmbedding) {
                Senna_embeddings_wordsFile.close();
                Senna_embeddings_vectorFile.close();
                Senna_embeddings_wordsFileW.close();
            }
            if (useCWEmbedding) {
                CWembeddingsFile.close();
                CWembeddingsFileW.close();
            }
            if (useHLBLEmbedding) {
                HLBLembeddingsFile.close();
                HLBLembeddingsFileW.close();
            }
            if (useWord2Vector300) {
                word2Vec300File.close();
                word2Vec300FileW.close();
            }
            comWordDicindexFile.close();
        }
    }
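Finally, the format table at the top leaves open whether the Senna embeddings should be divided by std(A). If you decide to apply that scaling, the sketch below is one way to do it; it assumes the whitespace-separated Senna_embeddings_train.vec produced above, uses a single global standard deviation over all entries (a per-dimension std is an equally plausible reading of std(A)), and the class name ScaleByStd is made up:

    // Rescale an embedding file by the global standard deviation of its entries.
    import java.io.*;
    import java.util.ArrayList;

    public class ScaleByStd {
        public static void main(String[] args) throws IOException {
            ArrayList<double[]> rows = new ArrayList<double[]>();
            BufferedReader in = new BufferedReader(new InputStreamReader(
                    new FileInputStream("Senna_embeddings_train.vec"), "UTF-8"));
            String line;
            double sum = 0, sumSq = 0;
            long count = 0;
            // First pass: load all vectors and accumulate mean/variance statistics
            while ((line = in.readLine()) != null) {
                if (line.trim().isEmpty()) continue;
                String[] toks = line.trim().split("\\s+");
                double[] row = new double[toks.length];
                for (int i = 0; i < toks.length; i++) {
                    row[i] = Double.parseDouble(toks[i]);
                    sum += row[i];
                    sumSq += row[i] * row[i];
                    count++;
                }
                rows.add(row);
            }
            in.close();
            double mean = sum / count;
            double std = Math.sqrt(sumSq / count - mean * mean);
            // Second pass: write every entry divided by the global std
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream("Senna_embeddings_train_scaled.vec"), "UTF-8"));
            for (double[] row : rows) {
                StringBuilder sb = new StringBuilder();
                for (double v : row) sb.append(v / std).append(" ");
                out.write(sb.toString().trim() + "\n");
            }
            out.close();
            System.out.println("std(A) = " + std);
        }
    }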
