Package gov.nysenate.openleg.scripts.admin

Source Code of gov.nysenate.openleg.scripts.admin.SpotCheck

package gov.nysenate.openleg.scripts.admin;

import gov.nysenate.openleg.model.Action;
import gov.nysenate.openleg.model.Bill;
import gov.nysenate.openleg.model.Person;
import gov.nysenate.openleg.model.admin.Report;
import gov.nysenate.openleg.model.admin.ReportObservation;
import gov.nysenate.openleg.model.admin.SpotCheckBill;
import gov.nysenate.openleg.scripts.BaseScript;
import gov.nysenate.openleg.util.Application;
import gov.nysenate.openleg.util.Storage;

import gov.nysenate.openleg.util.TextFormatter;
import gov.nysenate.util.Config;

import java.io.File;
import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanHandler;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.text.WordUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.log4j.Logger;

public class SpotCheck extends BaseScript
{
    /** Log4j logger used for all progress and mismatch messages of this script. */
    public static Logger logger = Logger.getLogger(SpotCheck.class);

    /** Parses the yyyyMMdd date prefix given as the first script argument. */
    public static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMdd");
    /** Session year appended (after a dash) to bill ids when amendment ids are built. */
    private String SESSION_YEAR = "2013";
    /** Matches a bill id as: one capital letter, the digits of the bill number, and an optional amendment letter. */
    public Pattern spotcheckBillId = Pattern.compile("([A-Z])(\\d+)([A-Z]?)");

    public static void main(String[] args) throws Exception
    {
        new SpotCheck().run(args);
    }

    public String unescapeHTML(String text)
    {
        return StringEscapeUtils.unescapeHtml4(text).replace("'", "'");
    }

    public boolean stringEquals(String a, String b, boolean ignoreCase, boolean normalizeSpaces)
    {
        if (normalizeSpaces) {
            a = a.replaceAll("\\s+", " ");
            b = b.replaceAll("\\s+", " ");
        }

        return (ignoreCase) ? a.equalsIgnoreCase(b) : a.equals(b);
    }

    public void execute(CommandLine opts) throws IOException, ParseException, SQLException
    {
        QueryRunner runner = new QueryRunner(Application.getDB().getDataSource());

        String[] args = opts.getArgs();
        Storage storage = Application.getStorage();

        Config config = Application.getConfig();

        List<ReportObservation> observations = new ArrayList<ReportObservation>();
        HashMap<String, Integer> errorTotals = new HashMap<String, Integer>();
        for (String error_type : new String[] {"title", "summary", "sponsor", "cosponsors", "events", "pages", "amendments"}) {
            errorTotals.put(error_type, 0);
        }

        String prefix = args[0];
        Date date = dateFormat.parse(prefix);
        logger.info("Processing daybreak files for: "+date);
        File directory = new File(config.getValue("checkmail.lrsFileDir"));
        HashMap<String, SpotCheckBill> bills = new HashMap<String, SpotCheckBill>();
        logger.info("Reading " + prefix+".senate.low.html");
        bills.putAll(readDaybreak(new File(directory, prefix+".senate.low.html")));
        logger.info("Reading " + prefix+".senate.high.html");
        bills.putAll(readDaybreak(new File(directory, prefix+".senate.high.html")));
        logger.info("Reading " + prefix+".assembly.low.html");
        bills.putAll(readDaybreak(new File(directory, prefix+".assembly.low.html")));
        logger.info("Reading " + prefix+".assembly.high.html");
        bills.putAll(readDaybreak(new File(directory, prefix+".assembly.high.html")));
        logger.info("Reading " + prefix+".page_file.txt");
        loadPageFile(new File(directory, prefix+".page_file.txt"), bills);

        SpotCheckBill testbill = bills.get("S1743A");

        runner.update("insert ignore into report(time) values(?)", date);
        Report report = runner.query("select * from report where time = ?", new BeanHandler<Report>(Report.class), date);
        runner.update("delete from report_observation where reportId = ?", report.getId());

        for(String id : bills.keySet()) {
            //logger.info("checking bill "+id);
            String billNo = id+"-2013";
            Bill jsonBill = (Bill)storage.get("2013/bill/"+billNo, Bill.class);

            if (jsonBill == null) {
                logger.error("Missing bill "+"2013/bill/"+billNo);
                continue;
            }

            if (!jsonBill.isPublished()) {
                logger.error("Bill Unpublished: "+billNo);
                continue;
            }

            // Compare the titles, ignore white space differences
            String jsonTitle = unescapeHTML(jsonBill.getTitle());
            String lbdcTitle = bills.get(id).getTitle();
            if (!lbdcTitle.isEmpty() && !stringEquals(jsonTitle, lbdcTitle, true, true)) {
                // What is this D?
                if (!id.startsWith("D")) {
                    logger.error("Title: "+billNo);
                    logger.error("  LBDC: "+lbdcTitle);
                    logger.error("  JSON: "+jsonTitle);
                    observations.add(new ReportObservation(report.getId(), billNo, "BILL_TITLE", lbdcTitle, jsonTitle));
                    errorTotals.put("title", errorTotals.get("title")+1);
                }
            }

            // Compare the summaries. LBDC reports summary and law changes together
            String jsonLaw = jsonBill.getLaw();
            String jsonSummary = unescapeHTML(jsonBill.getSummary());
            String lbdcSummary = bills.get(id).getSummary().replaceAll("\\s+", " ");

            if( jsonLaw != null && jsonLaw != "" && jsonLaw != "null") {
                jsonSummary = unescapeHTML(jsonLaw)+" "+jsonSummary;
            }

            if (lbdcSummary.equals("BILL SUMMARY NOT FOUND")) {
                lbdcSummary = "";
            }

            jsonSummary = jsonSummary.replace('§', 'S').replace('¶', 'P');
            if (!lbdcSummary.isEmpty() && !jsonSummary.replace(" ","").equals(lbdcSummary.replace(" ", "")) ) {
                if (!id.startsWith("D")) {
                    logger.error("Summary: "+billNo);
                    logger.error("  LBDC: "+lbdcSummary);
                    logger.error("  JSON: "+jsonSummary);
                    observations.add(new ReportObservation(report.getId(), billNo, "BILL_SUMMARY", lbdcSummary, jsonSummary));
                    errorTotals.put("summary", errorTotals.get("summary")+1);
                }
            }

            String jsonSponsor = "";
            if (jsonBill.getSponsor() != null) {
                jsonSponsor = unescapeHTML(jsonBill.getSponsor().getFullname()).toUpperCase().replace(" (MS)","").replace("BILL", "").replace("COM", "");
            }
            String lbdcSponsor = bills.get(id).getSponsor().toUpperCase().replace("BILL", "").replace("COM", "");
            if (lbdcSponsor.startsWith("RULES ") && lbdcSponsor.contains("(") && !lbdcSponsor.contains(")")){
                lbdcSponsor = lbdcSponsor.concat(")");
            }
            if (!lbdcSponsor.isEmpty() && !jsonSponsor.replace(" ","").equals(lbdcSponsor.replace(" ", "")) ) {
                if (!id.startsWith("D")) {
                    logger.error("Sponsor: "+billNo);
                    logger.error("  LBDC: "+lbdcSponsor);
                    logger.error("  JSON: "+jsonSponsor);
                    observations.add(new ReportObservation(report.getId(), billNo, "BILL_SPONSOR", lbdcSponsor, jsonSponsor));
                    errorTotals.put("sponsor", errorTotals.get("sponsor")+1);
                }
            }


            TreeSet<String> lbdcCosponsors = new TreeSet<String>();
            TreeSet<String> jsonCosponsors = new TreeSet<String>();
            if ( jsonBill.getCoSponsors() != null ) {
                List<Person> cosponsors = jsonBill.getCoSponsors();
                for(Person cosponsor : cosponsors) {
                    // store all names as Capitalized without () for comparison
                    jsonCosponsors.add(WordUtils.capitalizeFully(cosponsor.getFullname().replaceAll("[\\(\\)]+", "")));
                }
            }
            // Capitalize and remove () for lbdc too
            for(String cosponsor : bills.get(id).getCosponsors()){
                lbdcCosponsors.add(WordUtils.capitalizeFully(cosponsor.replaceAll("[\\(\\)]+", "")));
            }


            if (!lbdcCosponsors.isEmpty() && (lbdcCosponsors.size() != jsonCosponsors.size() || (!lbdcCosponsors.isEmpty() && !lbdcCosponsors.containsAll(jsonCosponsors))) ) {
                if (!id.startsWith("D")) {
                    logger.error("Cosponsors: "+billNo);
                    logger.error("  LBDC: "+lbdcCosponsors);
                    logger.error("  JSON: "+jsonCosponsors);
                    observations.add(new ReportObservation(report.getId(), billNo, "BILL_COSPONSOR", StringUtils.join(lbdcCosponsors, " "), StringUtils.join(jsonCosponsors, " ")));
                    errorTotals.put("cosponsors", errorTotals.get("cosponsors")+1);
                }
            }

            ArrayList<String> lbdcEvents = bills.get(id).getActions();
            ArrayList<String> jsonEvents = new ArrayList<String>();
            SimpleDateFormat dateFormat = new SimpleDateFormat("MM/dd/yy");

            for (Action action : jsonBill.getActions()) {
                jsonEvents.add(dateFormat.format(action.getDate())+" "+action.getText());
            }

            if (!lbdcEvents.isEmpty() &&  (lbdcEvents.size() != jsonEvents.size() || (!lbdcEvents.isEmpty() && !lbdcEvents.containsAll(jsonEvents))) ) {
                boolean substituted = StringUtils.join(jsonEvents, " ").toLowerCase().contains(" substituted ");
                boolean delivered =  !jsonEvents.isEmpty() ? jsonEvents.get(jsonEvents.size()-1).toLowerCase().contains(" delivered to ") : false;
                if (!id.startsWith("D") && !substituted && !delivered) {
                    logger.error("Events: "+billNo);
                    logger.error("  LBDC: "+lbdcEvents);
                    logger.error("  JSON: "+jsonEvents);
                    observations.add(new ReportObservation(report.getId(), billNo, "BILL_ACTION", StringUtils.join(lbdcEvents,"\n"), StringUtils.join(jsonEvents,"\n")));
                    errorTotals.put("events", errorTotals.get("events")+1);
                }
            }

            int lbdcPages = bills.get(id).pages;
            int jsonPages = TextFormatter.pdfPrintablePages(jsonBill).size();
            if (jsonBill.getFulltext().equals("")) {
                jsonPages = 0;
            }

            if (jsonPages != lbdcPages) {
                logger.error("Pages: "+billNo);
                logger.error("  LBDC: "+lbdcPages);
                logger.error("  JSON: "+jsonPages);
                observations.add(new ReportObservation(report.getId(), billNo, "BILL_TEXT_PAGE", String.valueOf(lbdcPages), String.valueOf(jsonPages)));
                errorTotals.put("pages", errorTotals.get("pages")+1);
            }

            ArrayList<String> lbdcAmendments = bills.get(id).getAmendments();
            ArrayList<String> jsonAmendments = (ArrayList) jsonBill.getAmendments();

            // Bill.getAmendments() does not include Bill itself, SpotCheckBill.getAmendments() does.
            jsonAmendments.add(jsonBill.getBillId());

            if (!amendmentsEqual(lbdcAmendments, jsonAmendments)) {
                logger.error("Amendments: " + billNo);
                logger.error("This Bill amendment : " + jsonBill.getBillId());
                logger.error("  LBDC: " + lbdcAmendments);
                logger.error("  JSON: " + jsonAmendments);
                observations.add(new ReportObservation(report.getId(), billNo, "BILL_AMENDMENT", StringUtils.join(lbdcAmendments, "\n"), StringUtils.join(jsonAmendments, "\n")));
                errorTotals.put("amendments", errorTotals.get("amendments") + 1);
            }
            //logger.info("Bill "+id+" checked");
        }

        logger.info("Bills checked, writing observations to db...");

        for (ReportObservation observation : observations) {
            //logger.info("inserting observation  "+observation.getOid() + " " + observation.getField() + " " + observation.getActualValue() + " " + observation.getObservedValue());
            runner.update(
                    "INSERT INTO report_observation (reportId, oid, field, actualValue, observedValue) VALUES (?, ?, ?, ?, ?)",
                    observation.getReportId(),
                    observation.getOid(),
                    observation.getField(),
                    observation.getActualValue(),
                    observation.getObservedValue()
            );
            //logger.info("observation " + observation.getOid() + " inserted");
        }

        System.out.println(errorTotals);
        System.out.println(bills.keySet().size());
        //System.exit(0);

        int total = 0;
        for(SpotCheckBill bill : bills.values()) {
            total += 1+bill.amendments.size();
        }
        System.out.println("Estimated Total: "+total);

        System.out.println(bills.size());

    }

    private boolean amendmentsEqual(ArrayList<String> lbdcAmendments, ArrayList<String> jsonAmendments) {
        if (lbdcAmendments.size() == 0) {
            return true;
        }
        Collections.sort(lbdcAmendments);
        Collections.sort(jsonAmendments);
        return lbdcAmendments.equals(jsonAmendments);
    }

    public void loadPageFile(File dataFile, HashMap<String, SpotCheckBill> bills) throws IOException
    {
        List<String> entries = FileUtils.readLines(dataFile, "latin1");
        entries.remove(0); // Remove the header line
        System.out.println(entries.size());
        for(String entry : entries) {
            String[] parts = entry.split(",");
            String sen_id = (parts[1]+parts[2].replaceAll("^0*", "")+parts[3]).trim();
            String asm_id = (parts[4]+parts[5].replaceAll("^0*", "")+parts[6]).trim();
            int pages = Integer.parseInt(parts[8]);

            if(!sen_id.isEmpty()) {
                updateBillFromPageFile(bills, sen_id, pages);
            }

            if(!asm_id.isEmpty()) {
                updateBillFromPageFile(bills, asm_id, pages);
            }

            if (!sen_id.isEmpty() && !asm_id.isEmpty()) {
                bills.get(sen_id).sameas = asm_id;
                bills.get(asm_id).sameas = sen_id;
            }
        }
    }

    private void updateBillFromPageFile(HashMap<String, SpotCheckBill> bills, String bill_id, int pages){
        if (bills.containsKey(bill_id)) {
            bills.get(bill_id).pages = pages;
        }
        else {
            // Look for
            //logger.error("Unknown bill '"+asm_id+"'");
            SpotCheckBill bill = new SpotCheckBill();
            bill.id = bill_id;
            bill.pages = pages;
            bills.put(bill_id, bill);
        }
    }

    private void updateAmendmentsFromPageFile(HashMap<String, SpotCheckBill> bills, String billId, Matcher billMatcher){
        String baseBill = billMatcher.group(1).trim() + billMatcher.group(2).trim();
        char amendment = billMatcher.group(3).charAt(0);
        if(bills.containsKey(baseBill) &&
                bills.get(baseBill).isCurrentAmendment() && !bills.get(baseBill).amendments.contains(billId)){
            bills.get(baseBill).amendments.add(billId);
        }
        for (int i = amendment-1; i >= 'A'; i--) {
            String checkedBillId = baseBill + (char)i;
            if(bills.containsKey(checkedBillId) &&
                        bills.get(checkedBillId).isCurrentAmendment() &&
                        !bills.get(checkedBillId).amendments.contains(billId) ){
                    bills.get(checkedBillId).amendments.add(billId);
            }
        }
    }

    private HashMap<String, SpotCheckBill> readDaybreak(File daybreak) throws IOException {
        String html = FileUtils.readFileToString(daybreak, "latin1").replaceAll("\\r?\\n", " ");

        Pattern billDataPattern = Pattern.compile("<tr.*?>(.+?)</tr>");
        Pattern stripHtmlTags = Pattern.compile("<b>(.*?)</b>|<(a|/a|td).*?>|<br>\\s*Criminal Sanction Impact.");

        Matcher billDataMatcher = billDataPattern.matcher(html);
        billDataMatcher.find(); // Throw the first two rows away
        billDataMatcher.find(); // They are just headers for the table

        HashMap<String, SpotCheckBill> bills = new HashMap<String, SpotCheckBill>();
        while(billDataMatcher.find()) {
            String rawBillHtml = billDataMatcher.group(1);
            String billParts[] = stripHtmlTags.matcher(rawBillHtml).replaceAll("").replace("</td>", "<br>").split("<br>");

            SpotCheckBill bill = extractBill(billParts);
            bills.put(bill.id, bill);
        }

        return bills;
    }

    private SpotCheckBill extractBill(String[] billParts) {
        SpotCheckBill bill = new SpotCheckBill();

        bill.setCurrentAmendment(true);
        bill.id = billParts[0].trim();
        bill.setTitle(billParts[2].trim());
        bill.law = billParts[3].trim();
        bill.setSummary(billParts[4].trim());
        bill.setActions(extractBillActions(billParts));
        bill.setSponsor(extractSponsor(billParts[1]));
        bill.setAmendments(extractAmendments(bill));

        if (isSenateBill(bill)) {
            bill.setCosponsors(extractSenateCosponsors(billParts[1]));
        } else { // is assembly bill
            bill.setCosponsors(extractAssemblyCosponsors(billParts[1]));
            bill.setMultisponsors(extractMultisponsors(billParts[1]));
        }

        return bill;
    }

    private ArrayList<String> extractBillActions(String[] billParts) {
        ArrayList<String> actions = new ArrayList<String>();
        for (int i = 5; i < billParts.length; i++) {
            String action = billParts[i].trim();
            try {
                SimpleDateFormat actionDateFormat = new SimpleDateFormat("MM/dd/yy");
                // Valid actions must start with date
                actionDateFormat.parse(action.split(" ")[0]);
                actions.add(action);
            } catch (ParseException e) {
                // Skip this action
            }

        }
        return actions;
    }

    // Used in old code for parsing all types of assembly sponsors:
    // parts[1] = parts[1].replaceAll("([A-Z])\\.[¦ ]([A-Z'-]+)", "$2 $1");

    /**
     * Extract the sponsor from a text field containing sponsor, cosponsors, and multisponsors.
     * <pre>Examples:
     * Senate: "DIAZ CO: ESPAILLAT, SAMPSON". returns DIAZ.
     * Assembly: "MONTESANO, GOODELL, NOJAY; M-S: Barclay, Hawley, Oaks, Tenney, Thiele". returns MONTESANO.
     * Assembly: "RULES COM (Request of Farrell, Wright)". returns itself unchanged.
     * </pre>
     * @param billPart string containing all sponsor, cosponsor, and multisponsor information.
     * @return sponsor of bill.
     */
    private String extractSponsor(String billPart) {
        String sponsor = billPart.split("CO:|; M-S:")[0].split(",")[0].trim();
        return formatInitials(sponsor);
    }

    private ArrayList<String> extractAmendments(SpotCheckBill bill) {
        ArrayList<String> amendments = new ArrayList<String>();

        Matcher idMatcher = spotcheckBillId.matcher(bill.id);
        if(anAmendmentExists(idMatcher)) {
            String billNo = idMatcher.group(2).trim();
            String billId = idMatcher.group(1).trim() + billNo;
            char amendment = idMatcher.group(3).charAt(0);
            for (int i = amendment-1; i >= 'A'; i--) {
                amendments.add(billId + String.valueOf((char)i) + "-" + SESSION_YEAR);
            }
            // Also add the base bill
            amendments.add(billId + "-" + SESSION_YEAR);
        }

        // Add bill itself as an amendment.
        amendments.add(bill.id + "-" + SESSION_YEAR);

        return amendments;
    }

    private boolean anAmendmentExists(Matcher idMatcher) {
        return idMatcher.find() && !idMatcher.group(3).isEmpty();
    }

    /**
     * In senate html files, 'CO:' separates sponsor from cosponsors.
     * i.e. PERALTA CO: ADDABBO, ESPAILLAT
     */
    private ArrayList<String> extractSenateCosponsors(String billPart) {
        ArrayList<String> cosponsors = new ArrayList<String>();

        String[] split = billPart.split("CO:");
        if (split.length >= 2) {
            for (String cosponsor : split[1].split(",")) {
                cosponsor = cosponsor.trim();
                cosponsor = formatInitials(cosponsor);
                cosponsors.add(cosponsor);
            }
        }
        return cosponsors;
    }

    /**
     * In assembly html files, '; M-S:' separates sponsor and cosponsors from multisponsors
     * i.e. KAVANAGH, PEOPLES-STOKES, JAFFEE, ROBINSON; M-S: Arroyo, Brook-Krasny, Gottfried, Lifton
     */
    private ArrayList<String> extractAssemblyCosponsors(String billPart) {
        ArrayList<String> cosponsors = new ArrayList<String>();

        String[] sponsorAndCosponsors = billPart.split("; M-S:")[0].split(",");
        // The first element is the sponsor, skip him.
        for (int i = 1; i < sponsorAndCosponsors.length; i++) {
            String cosponsor = sponsorAndCosponsors[i].trim();
            cosponsor = formatInitials(cosponsor);
            cosponsors.add(cosponsor);
        }
        return cosponsors;
    }

    private ArrayList<String> extractMultisponsors(String billPart) {
        ArrayList<String> multisponsors = new ArrayList<String>();

        String[] split = billPart.split("; M-S:");
        if (split.length >= 2) {
            for (String multisponsor : split[1].split(",")) {
                multisponsor = multisponsor.trim();
                multisponsor = formatInitials(multisponsor);
                multisponsors.add(multisponsor);
            }
        }
        return multisponsors;
    }

    /**
     * Changes a name to its uniquely identifying value if not already in that form.
     * i.e. change 'P. Lopez' to 'LOPEZ P'
     * @param name Name to format if in incorrect format.
     * @return Name formatted to match the uniquely identifying name found in sobi files.
     */
    private String formatInitials(String name) {
        String[] parts = name.split("[^\\x00-\\x7F]+"); // Split on non ascii characters.
        if (parts.length > 1) {
            String firstInitial = parts[0].replace(".", "");
            String lastName = parts[1];
            name = lastName + " " + firstInitial;
            name = name.toUpperCase();
        }

        return name;
    }

    private boolean isSenateBill(SpotCheckBill bill) {
        return bill.id.startsWith("S");
    }
}
TOP

Related Classes of gov.nysenate.openleg.scripts.admin.SpotCheck

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.