dblp.xml到.arff的转换

dblp.xml到.arff的转换

Posted by dym on January 20, 2014

本文迁移自老博客,原始链接为 https://seven.blog.ustc.edu.cn/dblp-xml%e5%88%b0-arff%e7%9a%84%e8%bd%ac%e6%8d%a2/

将dblp.xml文件中的数据转化为weka可用的arff格式文件。由于一开始对weka的文件格式不熟悉,绕了很多弯路,所以值得把方法记下来。使用dom4j包中的SAXReader逐条解析xml,读出作者名,过滤掉只出现一次的作者记录,过滤掉名字中出现非法符号的作者名字。然后用哈希表给名字编号,同时用文件流将拼接好的字符串写入.arff文件。文件采用标称属性类型,并用稀疏矩阵有效地压缩了空间,最终文件的属性数约为119万个,实例数约为150万条,文件大小为97M。写成的文件格式如下例所示:

@relation name
@attribute 'E. F. Codd' {F , T}
@attribute 'Patrick A. V. Hall' {F , T}
@attribute 'Markus Tresch' {F , T}
@attribute 'C. J. Date' {F , T}
@attribute 'Michael Ley' {F , T}
@attribute 'Werner John' {F , T}
@attribute 'Dominik Ley' {F , T}
@attribute 'Markus Casper' {F , T}
@attribute 'Hugo Hellebrand' {F , T}
@attribute 'Ralf Merz' {F , T}
@attribute 'John Lions' {F , T}
…
@data
{0 T, 3 T }
{5 T, 6 T }
{7 T, 8 T, 9 T, 10 T }
{7 T, 8 T, 12 T, 13 T, 14 T, 15 T }
{18 T, 19 T }
{18 T, 19 T, 20 T }
{23 T, 24 T }
{25 T, 27 T }
{28 T, 29 T, 30 T }
…

采用的预处理代码自动完成了从xml文件的读取、边读边清理内存、过滤非法名字和无用信息(如只出现一次的作者)、作者的编号、文件流的格式化输出等所有功能。执行完后需要将生成的两个文件(一个属性文件和一个数据文件)拷贝到同一个文件中,补全头部格式并删除掉不完整的记录,此处不再赘述。主要代码如下:

class newElementHandler implements ElementHandler{  
	Integer num = 0;
	Integer nump = 0;
	boolean flag = false;
	FileWriter writer1 , writer2;
	BufferedWriter bw ,bw2;
	Hashtable<String , Integer> h = new Hashtable<String , Integer>();
	String str , str1;
	List<Integer> l = new ArrayList<Integer>();
	public newElementHandler() throws IOException
	{
		writer1 = new FileWriter("D:/11.arff",true);
		bw=new BufferedWriter(writer1);
		writer2 = new FileWriter("D:/22.txt",true);
		bw2 = new BufferedWriter(writer2);
	}
	int no = 0;
	int tmp = -1;
	public void onEnd(ElementPath ep) {  
         Element e = ep.getCurrent(); 
         if(!e.getName().equals("author") && !e.getName().equals("title") )
        	e.detach();
         else	
         {
        	if(e.getName().equals("title"))
        	{
        		if(num > 1 )
       		{
        			Collections.sort(l);
        			try {
				bw.write("{");
				} catch (IOException e2) {
					// TODO Auto-generated catch block
					e2.printStackTrace();
			}
        			tmp = -1;
        			Iterator<Integer> itr = l.iterator();
        			while (itr.hasNext()) {
        				Integer i = itr.next();
        				try {
        					if(i != tmp){
						bw.write(Integer.toString(i));
						if(no < num - 1)
							bw.write("  T, ");
						else
							bw.write("  T ");
        					}
        					tmp = i;
        					no++;
				} catch (IOException e1) {
				// TODO Auto-generated catch block
					e1.printStackTrace();
			          	}
        			}
        			try {
				bw.write("}");
				bw.write('\n');
			} catch (IOException e1) {
			// TODO Auto-generated catch block
			         e1.printStackTrace();
		          	}
       			}
		num = 0;
		no = 0;
		l.clear();
		flag = false;
       		e.detach();
        	}
        	else if(e.getName().equals("author") && num < 8) 
        	{
        		str = e.getText();
        		if(!str.matches("[A-Za-z_. ]*"))
        		{
              		e.detach();
        		}
        		else
        		{	
		        	if(h.get(str) == null)
			{	
				h.put(str, nump);
				l.add(nump);
				try {
					bw2.write("@attribute  '");
					bw2.write(str);
					bw2.write("'   {F , T}");
					bw2.write('\n');
				} catch (IOException e1) {
					// TODO Auto-generated catch block
					e1.printStackTrace();
				}
				nump++;
				num++;
			}
		        	else
		       	{
		        	         l.add(h.get(str));
		       		num++;   				
		       	}
		       	e.detach();
         	}
       	 }
        }
    }
    public void onStart(ElementPath arg0) {  
    }
}
/**
 * Entry point: streams {@code D:/dblp.xml} through a {@link newElementHandler},
 * which writes the ARFF attribute and data files as a side effect of parsing.
 */
public class ca {
	/**
	 * Parses the DBLP dump and makes sure all streams are closed afterwards.
	 *
	 * @param args unused
	 * @throws IOException if the input cannot be read or the output cannot be flushed
	 * @throws DocumentException if the XML cannot be parsed
	 */
	public static void main(String[] args) throws IOException, DocumentException {
		// Raises the XML parser's entity expansion limit; presumably needed because
		// the DBLP dump contains a very large number of entity references.
		System.setProperty("entityExpansionLimit", "10000000");
		String filepath =  "D:/dblp.xml";
		SAXReader reader = new SAXReader();
		newElementHandler hd = new newElementHandler();
		reader.setDefaultHandler(hd);
		try {
			BufferedInputStream in = new BufferedInputStream(new FileInputStream(new File(filepath)));
			try {
				// The returned document is not needed: the handler does all the work
				// and detaches every element as it goes.
				reader.read(in);
			} finally {
				in.close();
			}
		} finally {
			// The handler writes through BufferedWriters; without closing them the
			// last buffered rows and attribute lines never reach the files.
			try {
				hd.bw.close();
			} finally {
				hd.bw2.close();
			}
		}
	}
}