package com.dtyunxi.finance.biz.bulelakespider.service.sc;

import com.dtyunxi.finance.biz.bulelakespider.constants.SCConstants;
import com.dtyunxi.finance.biz.bulelakespider.service.IDataWebSpider;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.selector.Selectable;

/* loaded from: input_file:com/dtyunxi/finance/biz/bulelakespider/service/sc/SCDataWebSpider.class */
/**
 * WebMagic page processor that locates a Word attachment link on a crawled page
 * (via {@link SCConstants#linkPattern}), downloads the attachment, extracts its text
 * and stores the first capture group of {@link SCConstants#dataPattern} as the
 * "max liter" value exposed through {@link #getMaxLiter()}.
 *
 * <p>The result is held in a plain instance field, so one instance should be used
 * per crawl.
 */
public class SCDataWebSpider implements IDataWebSpider {
    private static final Logger log = LoggerFactory.getLogger(SCDataWebSpider.class);

    private static final String DOC_SUFFIX = ".doc";
    private static final String DOCX_SUFFIX = ".docx";
    /** Message reported when the link does not point at a Word document. */
    private static final String NOT_WORD_FILE_MESSAGE = "此文件不是word文件";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);
    private String maxLiter;

    /**
     * Extracts the attachment link from the page and parses the linked document.
     *
     * <p>The link is resolved against the directory part of the page URL
     * (everything before the last {@code '/'}).
     *
     * @param page the downloaded page handed over by the WebMagic spider
     */
    @Override
    public void process(Page page) {
        Selectable linkMatch = page.getHtml().regex(SCConstants.linkPattern, 1);
        String link = linkMatch.get();
        // Without this guard a page with no matching link would be turned into a
        // URL ending in the literal text "null".
        if (link == null || link.isEmpty()) {
            log.warn("No attachment link found on page {}", page.getUrl());
            return;
        }
        String pageUrl = page.getUrl().toString();
        String baseUrl = pageUrl.substring(0, pageUrl.lastIndexOf('/'));
        getDataForLink(baseUrl + "/" + link);
    }

    /**
     * Downloads the Word document at the given URL and, if its text matches
     * {@link SCConstants#dataPattern}, stores capture group 1 as the max-liter value.
     * The previously stored value is left untouched when nothing matches.
     *
     * @param str absolute URL of the {@code .doc} / {@code .docx} attachment
     */
    public void getDataForLink(String str) {
        String text = getInfo(str);
        Matcher matcher = Pattern.compile(SCConstants.dataPattern).matcher(text);
        if (matcher.find()) {
            this.maxLiter = matcher.group(1);
        } else {
            log.warn("Data pattern did not match the text extracted from {}", str);
        }
    }

    /**
     * Returns the value captured by the last successful crawl.
     *
     * @return the captured value, or {@code null} if nothing has been captured yet
     */
    @Override // com.dtyunxi.finance.biz.bulelakespider.service.IDataWebSpider
    public String getMaxLiter() {
        return this.maxLiter;
    }

    /**
     * Manual smoke test: crawls one fixed page and prints the captured value.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) {
        SCDataWebSpider spider = new SCDataWebSpider();
        Request request = new Request("http://fgw.sc.gov.cn/sfgw/c106088/2022/6/28/4156af2ea298475a81aae0f93bc04e74.shtml");
        request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36");
        Spider.create(spider).addRequest(request).thread(1).run();
        System.out.println(spider.maxLiter);
    }

    /**
     * Downloads a Word document from a URL and returns its plain text.
     *
     * <p>This method is best-effort: any failure (null or malformed URL, network
     * error, unreadable document, unsupported extension) is logged and reported
     * as an empty string rather than an exception.
     *
     * @param str absolute URL ending in {@code .doc} or {@code .docx}
     * @return the extracted text, or {@code ""} if it could not be obtained
     */
    public static String getInfo(String str) {
        if (str == null) {
            log.warn("Attachment URL is null");
            return "";
        }
        boolean isDocx = str.endsWith(DOCX_SUFFIX);
        boolean isDoc = str.endsWith(DOC_SUFFIX);
        if (!isDoc && !isDocx) {
            log.warn("{} (url={})", NOT_WORD_FILE_MESSAGE, str);
            return "";
        }
        try {
            URLConnection connection = new URL(str).openConnection();
            // Both formats are read from the remote stream; the address is a URL,
            // not a local file path.
            try (InputStream in = connection.getInputStream()) {
                String text;
                if (isDocx) {
                    try (XWPFWordExtractor extractor = new XWPFWordExtractor(OPCPackage.open(in))) {
                        text = extractor.getText();
                    }
                } else {
                    try (WordExtractor extractor = new WordExtractor(in)) {
                        text = extractor.getText();
                    }
                }
                return text == null ? "" : text;
            }
        } catch (MalformedURLException e) {
            log.error("Malformed attachment URL: {}", str, e);
        } catch (IOException e) {
            log.error("I/O error while reading {}", str, e);
        } catch (Exception e) {
            // POI may raise format-specific exceptions; keep the original
            // best-effort contract and report them as "no text".
            log.error("Failed to extract text from {}", str, e);
        }
        return "";
    }

    /**
     * Returns the crawl settings configured for this processor.
     *
     * @return the site configuration (3 retries, 100 ms sleep time)
     */
    @Override
    public Site getSite() {
        return this.site;
    }
}
