package org.fnlp.nlp.similarity.train;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import org.fnlp.data.reader.Reader;
import org.fnlp.ml.types.Instance;
import org.fnlp.nlp.cn.ChineseTrans;

/* loaded from: input_file:org/fnlp/nlp/similarity/train/SougouCA.class */
/**
 * Reader over a UTF-8 text file made of records laid out as consecutive lines:
 *
 * <pre>
 * &lt;doc&gt;
 * &lt;url&gt;...&lt;/url&gt;
 * &lt;docno&gt;...&lt;/docno&gt;
 * &lt;contenttitle&gt;...&lt;/contenttitle&gt;
 * &lt;content&gt;... (may span several lines) ...&lt;/content&gt;
 * (one trailing line, presumably the closing &lt;/doc&gt; tag)
 * </pre>
 *
 * <p>Usage is the plain {@code while (r.hasNext()) { r.next(); }} loop. Note that
 * {@link #hasNext()} is the method that advances the underlying stream: every call
 * loads the next record into {@link #url}, {@link #docno}, {@link #contenttitle} and
 * {@link #content}; {@link #next()} merely wraps the current {@code content}.
 */
public class SougouCA extends Reader {
    private static final ChineseTrans tc = new ChineseTrans();

    /** Source file, or {@code null} when the path given to the constructor does not exist. */
    File file;
    /** Open stream over {@link #file}; {@code null} when unavailable or already exhausted. */
    BufferedReader reader;
    /** Fields of the record loaded by the most recent successful {@link #hasNext()}. */
    String url = null;
    String docno = null;
    String contenttitle = null;
    String content = null;

    /**
     * Opens the file at the given path for reading as UTF-8.
     *
     * <p>No exception is thrown when the file is missing or cannot be opened; the
     * reader is then simply empty ({@link #hasNext()} returns {@code false}).
     *
     * @param str path of the corpus file
     */
    public SougouCA(String str) {
        File candidate = new File(str);
        if (!candidate.exists()) {
            // Missing input is treated as an empty corpus rather than an error.
            return;
        }
        this.file = candidate;
        try {
            this.reader = new BufferedReader(new InputStreamReader(new FileInputStream(candidate), "UTF8"));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e2) {
            e2.printStackTrace();
        }
    }

    /**
     * Advances to the next record and reports whether one was loaded.
     *
     * <p>Lines preceding the next {@code <doc>} marker are skipped. When the input is
     * exhausted, ends in the middle of a record, or fails with an I/O error, the
     * underlying reader is closed and {@code false} is returned; the fields then keep
     * the values of the last complete record.
     *
     * <p>This method is not idempotent: calling it twice in a row skips a record.
     *
     * @return {@code true} if a complete record was read into the fields
     */
    @Override
    public boolean hasNext() {
        if (this.reader == null) {
            return false;
        }
        boolean loaded = false;
        try {
            loaded = readRecord();
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (!loaded) {
            closeReader();
        }
        return loaded;
    }

    /**
     * Wraps the content of the record loaded by the last successful
     * {@link #hasNext()} call. Does not advance the stream.
     *
     * @return a new instance whose data is the current {@code content} string
     */
    @Override
    public Instance next() {
        return new Instance(this.content);
    }

    /**
     * Reads one complete record into the fields.
     *
     * @return {@code false} if the end of input is reached before a record is complete
     * @throws IOException if reading from the underlying stream fails
     */
    private boolean readRecord() throws IOException {
        String line = this.reader.readLine();
        while (line != null && !isDocStart(line)) {
            line = this.reader.readLine();
        }
        if (line == null) {
            return false;
        }

        String urlLine = this.reader.readLine();
        if (urlLine == null) {
            return false;
        }
        String docnoLine = this.reader.readLine();
        if (docnoLine == null) {
            return false;
        }
        String titleLine = this.reader.readLine();
        if (titleLine == null) {
            return false;
        }

        // The content element may be broken over several physical lines; they are
        // joined without a separator until a line ends with the closing tag.
        String part = this.reader.readLine();
        if (part == null) {
            return false;
        }
        StringBuilder body = new StringBuilder(part);
        while (!part.endsWith("</content>")) {
            part = this.reader.readLine();
            if (part == null) {
                return false;
            }
            body.append(part);
        }

        // Publish the fields only once the whole record has been read.
        this.url = stripTag(urlLine, "url");
        this.docno = stripTag(docnoLine, "docno");
        this.contenttitle = stripTag(titleLine, "contenttitle");
        this.content = stripTag(body.toString(), "content");

        // Consume the line that follows the content (presumably "</doc>").
        this.reader.readLine();
        return true;
    }

    /** Tells whether the line opens a record, ignoring a leading byte-order mark. */
    private static boolean isDocStart(String line) {
        // The UTF-8 decoder does not strip a BOM, so the first line of a file may carry one.
        String candidate = line.startsWith("\uFEFF") ? line.substring(1) : line;
        return candidate.equals("<doc>");
    }

    /** Removes a leading {@code <tag>} and a trailing {@code </tag>} from the text, if present. */
    private static String stripTag(String text, String tag) {
        return text.replaceAll("^<" + tag + ">", "").replaceAll("</" + tag + ">$", "");
    }

    /** Closes the underlying reader (if still open) and marks this reader as exhausted. */
    private void closeReader() {
        if (this.reader == null) {
            return;
        }
        try {
            this.reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        this.reader = null;
    }

    /**
     * Reads {@code ./tmpdata/SogouCa/news.allsites.010805.txt}, passes each record's
     * content through {@link ChineseTrans#normalize}, and writes every non-empty
     * result as one line of {@code ./tmpdata/trad.txt} (UTF-8).
     *
     * @param strArr ignored
     * @throws IOException if the output file cannot be created or written
     */
    public static void main(String[] strArr) throws IOException {
        BufferedWriter bufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("./tmpdata/trad.txt"), "UTF-8"));
        try {
            SougouCA sougouCA = new SougouCA("./tmpdata/SogouCa/news.allsites.010805.txt");
            while (sougouCA.hasNext()) {
                String normalize = tc.normalize((String) sougouCA.next().getData());
                if (normalize.length() != 0) {
                    bufferedWriter.write(normalize);
                    bufferedWriter.write("\n");
                }
            }
        } finally {
            // Flush and release the output file even when processing fails midway.
            bufferedWriter.close();
        }
        System.out.println("Done!");
    }
}
