Data format for DCNN-2014:
Variable | Size | Description | Status |
index | 53,714 words (comWord.dic) | string dictionary; try to ensure every word appears in the train/test sets and in the word embeddings | OK |
sent_length | 40 (actual maximum length is 38) | maximum text length | OK |
size_vocab | 53,714 + 1 | dictionary size plus one blank padding word | OK |
test | 2280 * sent_length | test data | OK |
test_lbl | 64-bit hash labels optional | {optional 64-bit hash label}{text length}; hash labels come from spectral-hashing training | |
train | 10060 * sent_length | training data | OK |
train_lbl | 64-bit hash labels required | {64-bit hash label}{text length}; hash labels come from spectral-hashing training | |
vocab_emb_1_CW | 50 * vocab size | externally obtained word vectors; they cover too little of our vocabulary | deferred |
vocab_emb_2_HLBL | 50 * vocab size | even stacked with the others, coverage is too low | deferred |
vocab_emb_3_Senna | 50 * vocab size | externally obtained word vectors; consider whether to divide by std(A) | OK |
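To make the table concrete, here is a minimal sanity-check sketch. The filenames are assumed from Step 5 below and the expected sizes come straight from the table; it verifies that every line of a *_DCNN.txt file holds exactly sent_length indices, none larger than size_vocab:
- import java.io.*;
- public class CheckDCNNFormat {
-     public static void main(String[] args) throws IOException {
-         int sentLength = 40;       // sent_length from the table
-         int sizeVocab = 53714 + 1; // dictionary size + 1 padding word
-         BufferedReader in = new BufferedReader(new InputStreamReader(
-                 new FileInputStream("test_DCNN.txt"), "UTF-8")); // assumed Step 5 output
-         String line;
-         int lineNum = 1;
-         while ((line = in.readLine()) != null) {
-             String[] idx = line.trim().split("\\s+");
-             if (idx.length != sentLength)
-                 System.out.println("Bad width in line " + lineNum + ": " + idx.length);
-             for (String s : idx)
-                 if (Integer.parseInt(s) > sizeVocab)
-                     System.out.println("Index out of range in line " + lineNum);
-             lineNum++;
-         }
-         in.close();
-     }
- }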
【SearchSnippets dataset overview】: [download link], [backup download link]
Training and test data as Google search snippets.
Training data: — File: train.txt — Size: 10,060 snippets
Test data: — File: test.txt — Size: 2,280 snippets
Each snippet is on its own line.
Each snippet consists of a list of words/terms plus a class label at the end.
8 domains (class labels):
Business, Computers, Culture-Arts-Entertainment, Education-Science, Engineering, Health, Politics-Society, Sports
【Step 1. Build the dictionary】: A convenient trick is to paste the contents of train.txt and test.txt into a single txt file and run LDA on it, which automatically generates a wordmap.txt file. [LDA link], [backup LDA tool]
LDA launch command: java -jar jgibblda.jar -est -alpha 0.5 -beta 0.1 -ntopics 400 -dir ./ -dfile train.txt (see [http://jacoxu.com/?p=648] for details)
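If you do take the LDA route, note that the wordmap.txt that jGibbLDA writes is not quite the dictionary format the later steps consume (one word<TAB>index per line). A small conversion sketch; it assumes wordmap.txt holds a word count on the first line followed by "word id" pairs with ids starting at 0, which is how jGibbLDA writes it as far as I recall, so verify against your own output:
- import java.io.*;
- public class WordMap2Dic {
-     public static void main(String[] args) throws IOException {
-         BufferedReader in = new BufferedReader(new InputStreamReader(
-                 new FileInputStream("wordmap.txt"), "UTF-8"));
-         BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream("comWord.dic"), "UTF-8"));
-         in.readLine(); // skip the leading word-count line
-         String line;
-         while ((line = in.readLine()) != null) {
-             String[] parts = line.split("\\s+");
-             if (parts.length < 2) continue;
-             // jGibbLDA ids start at 0; this pipeline indexes from 1
-             out.write(parts[0] + "\t" + (Integer.parseInt(parts[1]) + 1) + "\n");
-         }
-         in.close();
-         out.close();
-     }
- }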
However, the SearchSnippets text was scraped from Google result pages and is quite noisy. A better dictionary can be built from related Wikipedia text; the SearchSnippets authors provide [a matching Wiki corpus], [backup download link].
Still, writing your own dictionary-extraction program is usually more convenient. Be very careful about encodings here; stick to UTF-8 and Unix line endings. (Note: line 464 of the test set ends with mojibake, "windows vista鈩?computers". One option is to strip the garbage by hand, changing it to "windows vista computers"; the other is to leave it alone and handle it when assigning labels. It occurs only once and has little impact, so for simplicity and consistency later on we take the second option and keep it as-is.)
- package Process4DCNN;
- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileOutputStream;
- import java.io.InputStreamReader;
- import java.io.OutputStreamWriter;
- import java.util.HashSet;
- import java.util.Iterator;
- public class Step1_GenerateDict4SearchSnippets {
-     public static void main(String[] args) throws Exception {
-         // Read the SearchSnippets dataset
-         String searchSnippetPathStr = "./Data/SearchSnippets/all-test+train.txt";
-         String comWordWithIdxPathStr = "./Data/SearchSnippets/comWordWithIdx_train.dic";
-         BufferedReader corpusFile = new BufferedReader(new InputStreamReader(
-                 new FileInputStream(new File(searchSnippetPathStr)), "UTF-8"));
-         BufferedWriter wordMapFileW = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream(new File(comWordWithIdxPathStr)), "UTF-8"));
-         String tempLine;
-         // Start reading the data
-         int wordReadNum = 0;
-         int wordWriteNum = 0;
-         HashSet<String> wordSet = new HashSet<String>();
-         System.out.println("Start to read wordSet ...");
-         while ((tempLine = corpusFile.readLine()) != null) {
-             String[] wordArraysStr = tempLine.split("\\s+");
-             // The last token of each line is the class label, so skip it
-             for (int i = 0; i < wordArraysStr.length - 1; i++) {
-                 String tmpWord = wordArraysStr[i].trim();
-                 if (wordSet.contains(tmpWord)) continue;
-                 else {
-                     wordSet.add(tmpWord);
-                     wordReadNum++;
-                 }
-             }
-         }
-         System.out.println("Start to output wordSet ...");
-         Iterator<String> iterator = wordSet.iterator();
-         while (iterator.hasNext()) {
-             wordWriteNum++;
-             wordMapFileW.write(iterator.next() + "\t" + wordWriteNum + "\n");
-         }
-         System.out.println("wordReadNum:" + wordReadNum);
-         System.out.println("wordWriteNum:" + wordWriteNum);
-         if (wordWriteNum != wordReadNum) {
-             System.out.println("Error! wordWriteNum is different from wordReadNum");
-         }
-         corpusFile.close();
-         wordMapFileW.close();
-     }
- }
【Step 2. Filter the text with the dictionary】
[Input]: the dictionary comWord.dic and the input text train.txt
[Output]: the filtered text train_refined.txt; for lines that are empty after filtering, the line number is recorded, together with the maximum post-filter text length maxLen, in train_refined_Info.lst. (The code below actually runs on the concatenated all-test+train.txt and writes all_refined.txt / all_refined_Info.lst.) The corresponding code:
- package Process4DCNN;
- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.io.OutputStreamWriter;
- import java.util.HashSet;
- public class Step2_RefineDocs4SearchSnippets {
-     public static void main(String[] args) throws IOException {
-         // Read the dictionary and the text to be filtered
-         String comWordWithIdxPathStr = "./Data/SearchSnippets/comWordWithIdx_train.dic";
-         String searchSnippetPathStr = "./Data/SearchSnippets/all-test+train.txt";
-         BufferedReader comWordFile = new BufferedReader(new InputStreamReader(
-                 new FileInputStream(new File(comWordWithIdxPathStr)), "UTF-8"));
-         BufferedReader srcRawFile = new BufferedReader(new InputStreamReader(
-                 new FileInputStream(new File(searchSnippetPathStr)), "UTF-8"));
-         HashSet<String> dicSet = new HashSet<String>();
-         // Output the filtered text, plus the numbers of lines left empty by the filtering
-         String allRefinedFileStr = "./Data/SearchSnippets/all_refined.txt";
-         String invalidDocFileWStr = "./Data/SearchSnippets/all_refined_Info.lst";
-         BufferedWriter trainRefinedFile = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream(new File(allRefinedFileStr)), "UTF-8"));
-         BufferedWriter invalidDocFileW = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream(new File(invalidDocFileWStr)), "UTF-8"));
-         // Load the dictionary
-         String tempLineStr;
-         System.out.println("Start to load the dictionary!");
-         while ((tempLineStr = comWordFile.readLine()) != null) {
-             String[] tmpWords = tempLineStr.split("\t");
-             dicSet.add(tmpWords[0].trim());
-         }
-         // Filter the text against the dictionary, tracking the maximum length maxLen
-         System.out.println("Start to refine the raw documents!");
-         int maxLen = 0;
-         int lineNum = 1;
-         while ((tempLineStr = srcRawFile.readLine()) != null) {
-             int tmpLen = 0;
-             String[] tmpWords = tempLineStr.split("\\s+");
-             // The last token of each SearchSnippets line is the class label, so drop it
-             for (int i = 0; i < tmpWords.length - 1; i++) {
-                 String tmpWord = tmpWords[i];
-                 // Keep the word if it is in the dictionary, otherwise discard it
-                 if (dicSet.contains(tmpWord.trim())) {
-                     trainRefinedFile.write(tmpWord.trim() + " ");
-                     tmpLen++;
-                 } else {
-                     System.out.println("Error in line:" + lineNum + ", and the word is:" + tmpWord.trim());
-                 }
-             }
-             if (tmpLen == 0) {
-                 // Nothing survived the filtering, so record the line number
-                 invalidDocFileW.write(lineNum + "\n");
-                 trainRefinedFile.write(" \n"); // keep the blank line for now
-             } else {
-                 if (tmpLen > maxLen) maxLen = tmpLen;
-                 // if (tmpLen > 200) {
-                 //     invalidDocFileW.write("Overflowed:" + lineNum + ", with length:" + tmpLen + "\n");
-                 // }
-                 trainRefinedFile.write("\n");
-             }
-             lineNum++;
-         }
-         System.out.println("Info: maxLen is " + maxLen);
-         invalidDocFileW.write("Info: maxLen is " + maxLen);
-         comWordFile.close();
-         srcRawFile.close();
-         trainRefinedFile.close();
-         invalidDocFileW.close();
-     }
- }
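The code above filters the concatenated file, but Step 5 below expects separate test_refined.txt and train_refined.txt. Since the concatenation order is test followed by train (see the all = [test_data;train_data] comment in Step 4), a minimal split sketch, assuming the 2,280/10,060 line counts from the dataset description:
- import java.io.*;
- public class SplitRefined {
-     public static void main(String[] args) throws IOException {
-         int testSize = 2280; // first 2,280 lines are the test split (all = [test;train])
-         BufferedReader in = new BufferedReader(new InputStreamReader(
-                 new FileInputStream("./Data/SearchSnippets/all_refined.txt"), "UTF-8"));
-         BufferedWriter testW = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream("./Data/SearchSnippets/test_refined.txt"), "UTF-8"));
-         BufferedWriter trainW = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream("./Data/SearchSnippets/train_refined.txt"), "UTF-8"));
-         String line;
-         int lineNum = 0;
-         while ((line = in.readLine()) != null) {
-             // route each line to the test or train file, preserving order
-             (lineNum < testSize ? testW : trainW).write(line + "\n");
-             lineNum++;
-         }
-         in.close();
-         testW.close();
-         trainW.close();
-     }
- }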
【Step 3. Generate the label file for the dataset】:
Input: the raw text. Output: the label index for each text line.
- package Process4DCNN;
- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.io.OutputStreamWriter;
- import java.util.HashMap;
- public class Step3_CreateIndx4SearchSnippets {
-     public static void main(String[] args) throws IOException {
-         HashMap<String, Integer> tagMap = new HashMap<String, Integer>();
-         String searchSnippetPathStr = "./Data/SearchSnippets/all-test+train.txt";
-         BufferedReader srcRowFile = new BufferedReader(new InputStreamReader(
-                 new FileInputStream(new File(searchSnippetPathStr)), "UTF-8"));
-         // Output one label index per line of input text
-         String allIndxFileStr = "./Data/SearchSnippets/all_indx.txt";
-         BufferedWriter trainIndxFile = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream(new File(allIndxFileStr)), "UTF-8"));
-         tagMap.put("business", 1);
-         tagMap.put("computers", 2);
-         tagMap.put("culture-arts-entertainment", 3);
-         tagMap.put("education-science", 4);
-         tagMap.put("engineering", 5);
-         tagMap.put("health", 6);
-         tagMap.put("politics-society", 7);
-         tagMap.put("sports", 8);
-         // Read the corpus line by line
-         String tempLineStr;
-         System.out.println("Start to refine the raw documents!");
-         int lineNum = 1;
-         while ((tempLineStr = srcRowFile.readLine()) != null) {
-             String[] tmpWords = tempLineStr.split("\\s+");
-             // The last token of each SearchSnippets line is the label text
-             if (tagMap.containsKey(tmpWords[tmpWords.length - 1].trim())) {
-                 trainIndxFile.write(tagMap.get(tmpWords[tmpWords.length - 1].trim()) + "\n");
-             } else {
-                 System.out.println("Error in line:" + lineNum + ", and the tag is:" + tmpWords[tmpWords.length - 1].trim());
-                 // Special case: the mojibake line noted in Step 1 belongs to the computers class
-                 trainIndxFile.write(tagMap.get("computers") + "\n");
-             }
-             lineNum++;
-         }
-         System.out.println("OK!");
-         trainIndxFile.close();
-         srcRowFile.close();
-     }
- }
【Step 4. Build the vector space model (VSM)】:
Input: the label-stripped text and the dictionary. Output: the VSM.
- package Process4DCNN;
- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.io.OutputStreamWriter;
- import java.util.ArrayList;
- import java.util.HashMap;
- public class Step4_Process4STHFormat4SearchSnippets {
-     public static void main(String[] args) throws Exception {
-         // Build a term-frequency vector space model from the plain text and the wordmap,
-         // as preprocessing for STH
-         // all = [test_data; train_data]!
-         String allRefinedFileStr = "./Data/SearchSnippets/all_refined.txt";
-         String comWordWithIdxPathStr = "./Data/SearchSnippets/comWordWithIdx_train.dic";
-         String vsmTextStr = "./Data/SearchSnippets/vsmOfall.dic";
-         BufferedReader sourceTextRD = new BufferedReader(
-                 new InputStreamReader(new FileInputStream(new File(allRefinedFileStr)), "UTF-8"));
-         BufferedReader wordMapRD = new BufferedReader(
-                 new InputStreamReader(new FileInputStream(new File(comWordWithIdxPathStr)), "UTF-8"));
-         BufferedWriter vsmBW = new BufferedWriter(
-                 new OutputStreamWriter(new FileOutputStream(new File(vsmTextStr)), "UTF-8"));
-         // Build the term-frequency VSM
-         creatVSMText(sourceTextRD, wordMapRD, vsmBW);
-         sourceTextRD.close();
-         wordMapRD.close();
-         vsmBW.close();
-         System.out.println("It is done, ok!");
-     }
-     public static void creatVSMText(BufferedReader sourceTextRD,
-             BufferedReader wordMapRD, BufferedWriter vsmBW) throws IOException {
-         System.out.println("Start to create VSM ...!");
-         String tempLine;
-         // Load the dictionary first
-         int wordIdxNum = 0;
-         HashMap<String, Integer> wordMap = new HashMap<String, Integer>();
-         while ((tempLine = wordMapRD.readLine()) != null) {
-             if (tempLine.trim().length() == 0) continue;
-             // Each dictionary line holds a word and its index (indices start at 1)
-             String[] entry = tempLine.split("\\s+");
-             if (wordMap.containsKey(entry[0].trim())) {
-                 System.out.println("Test, the word is replicate:" + entry[0].trim());
-             }
-             wordMap.put(entry[0].trim(), Integer.valueOf(entry[1]));
-             wordIdxNum++;
-         }
-         // The feature dimensionality of this dataset; note it is dataset-specific
-         int dimVector = wordIdxNum;
-         System.out.println("Has read the dictionary, the size is:" + wordMap.size());
-         ArrayList<Integer> wordFreqList = new ArrayList<Integer>();
-         int lineNum = 1;
-         boolean hasWordFeature = false;
-         while ((tempLine = sourceTextRD.readLine()) != null) {
-             // When the corpus is large the VSM grows huge, so process it in batches:
-             // if (!(150001 <= lineNum)) { // lower bound
-             //     lineNum++;
-             //     continue;
-             // }
-             // if (!(lineNum <= 155000)) { // upper bound
-             //     break;
-             // }
-             hasWordFeature = false;
-             // One line equals one document
-             wordFreqList.clear();
-             for (int i = 0; i < dimVector; i++) {
-                 wordFreqList.add(0);
-             }
-             String[] tokensStr = tempLine.split("\\s+");
-             for (int j = 0; j < tokensStr.length; j++) {
-                 String tempToken = tokensStr[j];
-                 if (wordMap.containsKey(tempToken.trim())) {
-                     hasWordFeature = true;
-                     int index = wordMap.get(tempToken.trim());
-                     if (index > dimVector) {
-                         System.out.print("Error, and the word is: " + tempToken.trim());
-                     }
-                     wordFreqList.set(index - 1, wordFreqList.get(index - 1) + 1);
-                 } else {
-                     System.out.println("error: the map has not contain the word:"
-                             + tempToken + " in Line:" + lineNum);
-                 }
-             }
-             for (int tempFreq : wordFreqList) {
-                 vsmBW.write(String.valueOf(tempFreq) + " ");
-             }
-             vsmBW.write("\n");
-             if (!hasWordFeature) {
-                 System.out.println("++++++++++" + "has no word in Line:" + lineNum + "++++++++++");
-             }
-             lineNum++;
-         }
-     }
- }
【Step 5. Generate the DCNN-format training and test corpora】:
Input: the training text train_refined.txt, the test text test_refined.txt, the common dictionary comWord.dic, and the fixed maximum length 40.
Output: each word replaced by its dictionary index, with lines shorter than the maximum length padded with size(comWord)+1. This produces train_DCNN.txt and test_DCNN.txt, and writes the corresponding text lengths to train_lbl.txt and test_lbl.txt:
- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.io.OutputStreamWriter;
- import java.util.HashMap;
- public class Transform2DCNNFormat {
-     public static void main(String[] args) throws IOException {
-         // Read the dictionary and the corpus text
-         // (the filenames below handle the test split; run again with the train files)
-         BufferedReader comWordFile = new BufferedReader(new InputStreamReader(
-                 new FileInputStream(new File("./SearchSnippets/comWord.dic")), "UTF-8"));
-         BufferedReader srcRawFile = new BufferedReader(new InputStreamReader(
-                 new FileInputStream(new File("./SearchSnippets/test_refined.txt")), "UTF-8"));
-         HashMap<String, Integer> dicMap = new HashMap<String, Integer>();
-         // Output the DCNN-format corpus and the text lengths
-         BufferedWriter trainDCNNFile = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream(new File("test_DCNN.txt")), "UTF-8"));
-         BufferedWriter trainlblFileW = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream(new File("test_lbl.txt")), "UTF-8"));
-         int maxLen = 40;
-         // Load the dictionary
-         String tempLineStr;
-         System.out.println("Start to load the dictionary!");
-         while ((tempLineStr = comWordFile.readLine()) != null) {
-             String[] tmpWords = tempLineStr.split("\\s+");
-             dicMap.put(tmpWords[0].trim(), Integer.valueOf(tmpWords[1]));
-         }
-         // Transform the text via the dictionary, recording each text's length
-         System.out.println("Start to transform the raw documents!");
-         int lineNum = 1;
-         while ((tempLineStr = srcRawFile.readLine()) != null) {
-             int tmpLen = 0;
-             String[] tmpWords = tempLineStr.split("\\s+");
-             for (int i = 0; i < tmpWords.length; i++) {
-                 if (dicMap.containsKey(tmpWords[i].trim())) {
-                     trainDCNNFile.write(dicMap.get(tmpWords[i].trim()) + " ");
-                     tmpLen++; // count only the words actually written out
-                 } else {
-                     System.out.println("Error in line: " + lineNum);
-                 }
-             }
-             if (tmpLen >= maxLen) System.out.println("Error in line: " + lineNum);
-             // Pad short texts with the index size(comWord)+1
-             for (int i = tmpLen; i < maxLen; i++) {
-                 trainDCNNFile.write((dicMap.size() + 1) + " ");
-             }
-             trainDCNNFile.write("\n");
-             trainlblFileW.write(tmpLen + "\n");
-             lineNum++;
-         }
-         comWordFile.close();
-         srcRawFile.close();
-         trainDCNNFile.close();
-         trainlblFileW.close();
-         System.out.println("It's done, OK!");
-     }
- }
【Step 6. Fetch the matching word vectors】:
- package Process4DCNN;
- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.io.OutputStreamWriter;
- import java.util.ArrayList;
- import java.util.HashMap;
- public class Step6_FetchWord2Vec {
-     public static void main(String[] args) throws IOException {
-         String datasetStr = "SearchSnippets";
-         if (args.length == 1) {
-             datasetStr = args[0];
-         }
-         String comWordWithIdxPathStr = "./Data/SearchSnippets/comWordWithIdx_train.dic";
-         String word2Vec300FileStr = "./Data/GoogleNews-vectors-negative300.txt";
-         String word2Vec300FileWStr = "./Data/SearchSnippets/vocab_emb_Word2vec_GoNew300.vec";
-         String comWordDicindexFileWStr = "./Data/SearchSnippets/vocab_emb_Word2vec_GoNew300_index.dic";
-         System.out.println("Dataset:" + datasetStr);
-         boolean useSennaEmbedding = true;
-         boolean useCWEmbedding = false;
-         boolean useHLBLEmbedding = false;
-         boolean useWord2Vector300 = false;
-         int usedVectorNum = 0;
-         // Read the training corpus dictionary and the word-embedding files
-         BufferedReader wordmapFile = new BufferedReader(new InputStreamReader(
-                 new FileInputStream(new File(comWordWithIdxPathStr)), "UTF-8"));
-         BufferedReader Senna_embeddings_wordsFile = null;
-         BufferedReader Senna_embeddings_vectorFile = null;
-         BufferedWriter Senna_embeddings_wordsFileW = null;
-         if (useSennaEmbedding) {
-             Senna_embeddings_wordsFile = new BufferedReader(new InputStreamReader(
-                     new FileInputStream(new File("./WordEmbedding/Senna-embeddings-words.lst")), "UTF-8"));
-             Senna_embeddings_vectorFile = new BufferedReader(new InputStreamReader(
-                     new FileInputStream(new File("./WordEmbedding/Senna-embeddings.txt")), "UTF-8"));
-             Senna_embeddings_wordsFileW = new BufferedWriter(new OutputStreamWriter(
-                     new FileOutputStream(new File("Senna_embeddings_train.vec")), "UTF-8"));
-         }
-         BufferedReader CWembeddingsFile = null;
-         BufferedWriter CWembeddingsFileW = null;
-         if (useCWEmbedding) {
-             CWembeddingsFile = new BufferedReader(new InputStreamReader(
-                     new FileInputStream(new File("./WordEmbedding/CWembeddings-scaled.EMBEDDING_SIZE=50.txt")), "UTF-8"));
-             CWembeddingsFileW = new BufferedWriter(new OutputStreamWriter(
-                     new FileOutputStream(new File("CW_embeddings.vec")), "UTF-8"));
-         }
-         BufferedReader HLBLembeddingsFile = null;
-         BufferedWriter HLBLembeddingsFileW = null;
-         if (useHLBLEmbedding) {
-             HLBLembeddingsFile = new BufferedReader(new InputStreamReader(
-                     new FileInputStream(new File("./WordEmbedding/HLBL-embeddings-scaled.EMBEDDING_SIZE=50.txt")), "UTF-8"));
-             HLBLembeddingsFileW = new BufferedWriter(new OutputStreamWriter(
-                     new FileOutputStream(new File("HLBL_embeddings.vec")), "UTF-8"));
-         }
-         BufferedReader word2Vec300File = null;
-         BufferedWriter word2Vec300FileW = null;
-         if (useWord2Vector300) {
-             word2Vec300File = new BufferedReader(new InputStreamReader(
-                     new FileInputStream(new File(word2Vec300FileStr)), "UTF-8"));
-             word2Vec300FileW = new BufferedWriter(new OutputStreamWriter(
-                     new FileOutputStream(new File(word2Vec300FileWStr)), "UTF-8"));
-         }
-         // Output the common dictionary indices and the matching word vectors
-         BufferedWriter comWordDicindexFile = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream(new File(comWordDicindexFileWStr)), "UTF-8"));
-         // vecListMap collects, per dictionary index, the vectors found in each enabled embedding
-         HashMap<Integer, ArrayList<String>> vecListMap = new HashMap<Integer, ArrayList<String>>();
-         HashMap<String, Integer> dicIdxMap = new HashMap<String, Integer>();
-         String tempLineStr;
-         // Load the training-set dictionary first
-         System.out.println("Start read wordmapFile!");
-         while ((tempLineStr = wordmapFile.readLine()) != null) {
-             String[] termDict = tempLineStr.split("\\s+");
-             dicIdxMap.put(termDict[0].trim(), Integer.valueOf(termDict[1].trim())); // word -> index
-             ArrayList<String> tmpVectorList = new ArrayList<String>();
-             vecListMap.put(Integer.valueOf(termDict[1].trim()), tmpVectorList); // index -> vectorList
-         }
-         String tmpWordVector;
-         int SennaIdx = 0;
-         int CWIdx = 0;
-         int HLBLIdx = 0;
-         int Word2vec300Idx = 0;
-         int wordIndex = 0;
-         if (useSennaEmbedding) {
-             // Check the dictionary against the Senna embeddings
-             // (Senna ships the word list and the vectors in two parallel files)
-             System.out.println("Start read Senna_embeddings_wordsFile!");
-             while ((tempLineStr = Senna_embeddings_wordsFile.readLine()) != null) {
-                 tmpWordVector = Senna_embeddings_vectorFile.readLine();
-                 if (!dicIdxMap.containsKey(tempLineStr.trim())) continue;
-                 ArrayList<String> tmpVectorList = vecListMap.get(dicIdxMap.get(tempLineStr.trim()));
-                 if (tmpVectorList.size() > usedVectorNum) {
-                     System.out.println("Error in Senna_embeddings_wordsFile:" + tempLineStr);
-                     System.exit(0);
-                 }
-                 tmpVectorList.add(tmpWordVector);
-                 vecListMap.put(dicIdxMap.get(tempLineStr.trim()), tmpVectorList);
-             }
-             SennaIdx = usedVectorNum++;
-         }
-         // The CW and HLBL branches follow the same pattern as the Word2Vec branch below
-         // (each line of those embedding files holds the word followed by its vector).
-         if (useWord2Vector300) {
-             // Check the dictionary against the 300-d Word2Vec embeddings
-             System.out.println("Start read Word2Vector300File!");
-             while ((tempLineStr = word2Vec300File.readLine()) != null) {
-                 String[] tmpWords = tempLineStr.split("\\s+");
-                 tmpWordVector = tempLineStr.substring(tmpWords[0].length());
-                 if (!dicIdxMap.containsKey(tmpWords[0].trim())) continue;
-                 ArrayList<String> tmpVectorList = vecListMap.get(dicIdxMap.get(tmpWords[0].trim()));
-                 if (tmpVectorList.size() > usedVectorNum) {
-                     System.out.println("Error in Word2Vec300File:" + tempLineStr);
-                     System.exit(0);
-                 }
-                 tmpVectorList.add(tmpWordVector);
-                 vecListMap.put(dicIdxMap.get(tmpWords[0].trim()), tmpVectorList);
-                 wordIndex++;
-             }
-             Word2vec300Idx = usedVectorNum++;
-         }
-         // Output the dictionary indices and the matching word vectors
-         System.out.println("Start to generate dictionary, index and wordEmbedding!");
-         int obscIndex = 0; // count of words missing from some enabled embedding
-         wordIndex = 1;
-         for (int i = 1; i <= dicIdxMap.size(); i++) {
-             ArrayList<String> wordVectorList = vecListMap.get(i);
-             // Skip words that are missing from any of the enabled embeddings
-             if (wordVectorList.size() != usedVectorNum) {
-                 obscIndex++;
-                 continue;
-             }
-             comWordDicindexFile.write(i + "\n");
-             wordIndex++;
-             if (useSennaEmbedding)
-                 Senna_embeddings_wordsFileW.write(wordVectorList.get(SennaIdx) + "\n");
-             if (useCWEmbedding)
-                 CWembeddingsFileW.write(wordVectorList.get(CWIdx) + "\n");
-             if (useHLBLEmbedding)
-                 HLBLembeddingsFileW.write(wordVectorList.get(HLBLIdx) + "\n");
-             if (useWord2Vector300)
-                 word2Vec300FileW.write(wordVectorList.get(Word2vec300Idx) + "\n");
-         }
-         System.out.println("Info: the size of valid word list is:" + (wordIndex - 1));
-         System.out.println("Info: the size of invalid word list is:" + obscIndex);
-         wordmapFile.close();
-         if (useSennaEmbedding) {
-             Senna_embeddings_wordsFile.close();
-             Senna_embeddings_vectorFile.close();
-             Senna_embeddings_wordsFileW.close();
-         }
-         if (useCWEmbedding) {
-             CWembeddingsFile.close();
-             CWembeddingsFileW.close();
-         }
-         if (useHLBLEmbedding) {
-             HLBLembeddingsFile.close();
-             HLBLembeddingsFileW.close();
-         }
-         if (useWord2Vector300) {
-             word2Vec300File.close();
-             word2Vec300FileW.close();
-         }
-         comWordDicindexFile.close();
-     }
- }
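One loose end from the table at the top: whether vocab_emb_3_Senna should be divided by std(A). If you want to try that normalization, here is a minimal two-pass sketch; the input filename matches the Step 6 output above, the output filename is a hypothetical choice, and dividing every entry by the global standard deviation of the whole matrix is one reading of std(A):
- import java.io.*;
- import java.util.ArrayList;
- public class ScaleEmbeddingByStd {
-     public static void main(String[] args) throws IOException {
-         String inStr = "Senna_embeddings_train.vec";      // Step 6 output
-         String outStr = "Senna_embeddings_train_std.vec"; // hypothetical output name
-         // First pass: collect all values to compute the global standard deviation
-         ArrayList<Double> vals = new ArrayList<Double>();
-         BufferedReader in = new BufferedReader(new InputStreamReader(
-                 new FileInputStream(inStr), "UTF-8"));
-         String line;
-         while ((line = in.readLine()) != null)
-             for (String tok : line.trim().split("\\s+"))
-                 if (tok.length() > 0) vals.add(Double.valueOf(tok));
-         in.close();
-         double mean = 0;
-         for (double v : vals) mean += v;
-         mean /= vals.size();
-         double var = 0;
-         for (double v : vals) var += (v - mean) * (v - mean);
-         double std = Math.sqrt(var / vals.size());
-         // Second pass: rewrite each vector with every entry divided by std
-         in = new BufferedReader(new InputStreamReader(new FileInputStream(inStr), "UTF-8"));
-         BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
-                 new FileOutputStream(outStr), "UTF-8"));
-         while ((line = in.readLine()) != null) {
-             for (String tok : line.trim().split("\\s+"))
-                 if (tok.length() > 0) out.write((Double.parseDouble(tok) / std) + " ");
-             out.write("\n");
-         }
-         in.close();
-         out.close();
-     }
- }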