本文迁移自老博客,原始链接为 https://seven.blog.ustc.edu.cn/dblp-xml%e5%88%b0-arff%e7%9a%84%e8%bd%ac%e6%8d%a2/
将dblp.xml文件中的数据转化成为weka可用的arff格式的文件。由于一开始对weka的文件格式不熟悉,绕了很多弯路,所以值得把方法记下来。

使用dom4j 包中的SAXReader 逐条解析xml,读出作者名,过滤掉只含一位作者的记录,过滤掉名字中出现非法符号的作者名字。然后用哈希表给名字编号,同时用文件流将拼接好的字符串写入.arff文件,文件采用标称属性类型,并用稀疏矩阵有效地压缩了空间,最终文件的属性为119万条,实例数约为150万条,文件大小为97M。写成的文件格式如下例所示:

@relation name
@attribute 'E. F. Codd' {F , T}
@attribute 'Patrick A. V. Hall' {F , T}
@attribute 'Markus Tresch' {F , T}
@attribute 'C. J. Date' {F , T}
@attribute 'Michael Ley' {F , T}
@attribute 'Werner John' {F , T}
@attribute 'Dominik Ley' {F , T}
@attribute 'Markus Casper' {F , T}
@attribute 'Hugo Hellebrand' {F , T}
@attribute 'Ralf Merz' {F , T}
@attribute 'John Lions' {F , T}
…
@data
{0 T, 3 T }
{5 T, 6 T }
{7 T, 8 T, 9 T, 10 T }
{7 T, 8 T, 12 T, 13 T, 14 T, 15 T }
{18 T, 19 T }
{18 T, 19 T, 20 T }
{23 T, 24 T }
{25 T, 27 T }
{28 T, 29 T, 30 T }
…

采用的预处理代码自动完成了从xml文件的读取,边读边清理内存,过滤非法名字和无用信息(如只含一位作者的记录),作者的编号,文件流的格式化输出等所有的功能。执行完后,需要把生成的两个文件(一个属性文件和一个数据文件)拷贝到一个文件中,补全头部格式并删除不完整的记录,这里不再赘述。主要代码如下:
/**
 * dom4j element handler that emits the two halves of a sparse ARFF file while
 * the XML document is being parsed.
 *
 * <p>For every accepted {@code author} element the author name is mapped to a
 * numeric attribute index; the first time a name is seen, an
 * {@code @attribute '<name>' {F , T}} line is appended to {@code D:/22.txt}.
 * When a {@code title} element ends, the indices collected since the previous
 * title are written as one sparse data row ({@code {i T, j T }}) to
 * {@code D:/11.arff}, provided more than one author was accepted.
 *
 * <p>Every element is detached from the tree as soon as it has been handled so
 * the document does not accumulate in memory.
 *
 * <p>Call {@link #close()} after parsing; the output is buffered and the tail
 * of both files is lost otherwise.
 *
 * <p>NOTE(review): both files are opened in append mode while attribute
 * numbering restarts at 0 on every run, so the output files presumably have to
 * be removed before a re-run - confirm with the author.
 */
class newElementHandler implements ElementHandler, java.io.Closeable {
    /** Author names must consist only of these characters to be accepted. */
    private static final java.util.regex.Pattern VALID_NAME =
            java.util.regex.Pattern.compile("[A-Za-z_. ]*");

    /** At most this many authors are collected per record. */
    private static final int MAX_AUTHORS = 8;

    /** Number of authors accepted for the record currently being collected. */
    Integer num = 0;
    /** Next unused attribute index, i.e. the number of distinct names seen so far. */
    Integer nump = 0;
    boolean flag = false;
    FileWriter writer1 , writer2;
    /** {@code bw} receives the data rows, {@code bw2} the attribute declarations. */
    BufferedWriter bw ,bw2;
    /** Author name -> attribute index. */
    Hashtable<String , Integer> h = new Hashtable<String , Integer>();
    String str , str1;
    /** Attribute indices of the authors accepted for the current record. */
    List<Integer> l = new ArrayList<Integer>();
    int no = 0;
    /** Previously emitted index while writing a row; used to skip duplicates. */
    int tmp = -1;

    /**
     * Opens both output files in append mode.
     *
     * @throws IOException if either file cannot be opened; a file that was
     *         already opened is closed again before the exception propagates
     */
    public newElementHandler() throws IOException
    {
        writer1 = new FileWriter("D:/11.arff",true);
        bw = new BufferedWriter(writer1);
        try {
            writer2 = new FileWriter("D:/22.txt",true);
            bw2 = new BufferedWriter(writer2);
        } catch (IOException ex) {
            // Do not leak the first file handle when the second open fails.
            bw.close();
            throw ex;
        }
    }

    /**
     * Handles the end of an element: collects authors, flushes a data row on
     * {@code title}, and detaches the element in every case.
     *
     * @param ep path whose current element has just been closed
     * @throws IllegalStateException if writing to an output file fails
     */
    public void onEnd(ElementPath ep) {
        Element e = ep.getCurrent();
        String elementName = e.getName();
        try {
            if (elementName.equals("title")) {
                try {
                    writeRecord();
                } finally {
                    resetRecord();
                }
            } else if (elementName.equals("author") && num < MAX_AUTHORS) {
                str = e.getText();
                addAuthor(str);
            }
        } finally {
            // Nothing is needed from the element afterwards; drop it so the
            // tree stays small (this now includes authors past the cap).
            e.detach();
        }
    }

    public void onStart(ElementPath arg0) {
    }

    /**
     * Flushes and closes both output files.
     *
     * @throws IOException if closing either file fails
     */
    public void close() throws IOException {
        try {
            bw.close();
        } finally {
            bw2.close();
        }
    }

    /** Registers one author name for the current record if the name is acceptable. */
    private void addAuthor(String name) {
        if (!VALID_NAME.matcher(name).matches()) {
            return;
        }
        Integer id = h.get(name);
        if (id == null) {
            id = nump;
            try {
                bw2.write("@attribute '" + name + "' {F , T}\n");
            } catch (IOException ex) {
                throw new IllegalStateException("cannot write attribute for '" + name + "'", ex);
            }
            // Register the name only after its declaration was written, so the
            // attribute file and the index table cannot drift apart.
            h.put(name, id);
            nump++;
        }
        l.add(id);
        num++;
    }

    /** Writes the collected indices as one sparse ARFF row if more than one author was accepted. */
    private void writeRecord() {
        if (num <= 1) {
            return;
        }
        Collections.sort(l);
        StringBuilder row = new StringBuilder("{");
        boolean first = true;
        tmp = -1; // indices are never negative, so -1 matches nothing
        for (Integer id : l) {
            if (id.intValue() != tmp) {
                // Separator goes in front of every entry but the first, so a
                // skipped trailing duplicate can no longer leave "T, }".
                if (!first) {
                    row.append(", ");
                }
                row.append(id.intValue()).append(" T");
                first = false;
            }
            tmp = id;
        }
        row.append(" }\n");
        try {
            // One write per row: a failure cannot leave a half-written row.
            bw.write(row.toString());
        } catch (IOException ex) {
            throw new IllegalStateException("cannot write data row " + row.toString().trim(), ex);
        }
    }

    /** Clears the per-record state so the next record starts empty. */
    private void resetRecord() {
        num = 0;
        no = 0;
        l.clear();
        flag = false;
    }
}
/**
 * Entry point: streams {@code D:/dblp.xml} through a dom4j {@code SAXReader}
 * whose default handler is a {@code newElementHandler}, which produces the
 * output files as a side effect of parsing.
 */
public class ca {
    /**
     * Parses the input file and makes sure all files are closed afterwards.
     *
     * @param args unused
     * @throws IOException if the input cannot be opened or a file cannot be closed
     * @throws DocumentException if dom4j fails to parse the input
     */
    public static void main(String[] args) throws IOException, DocumentException {
        // Raise the JDK parser's entity expansion limit before parsing.
        System.setProperty("entityExpansionLimit", "10000000");
        String filepath = "D:/dblp.xml";
        SAXReader reader = new SAXReader();
        newElementHandler hd = new newElementHandler();
        reader.setDefaultHandler(hd);
        BufferedInputStream in = null;
        try {
            in = new BufferedInputStream(new FileInputStream(new File(filepath)));
            // The returned document is not needed: the handler detaches every
            // element and writes its output while parsing.
            reader.read(in);
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } finally {
                // Closing flushes the buffered writers; without this the last
                // buffered part of both output files is never written.
                try {
                    hd.bw.close();
                } finally {
                    hd.bw2.close();
                }
            }
        }
    }
}