#!/usr/bin/perl -w
#
# To run down one column of a textual data table that uses '|' for separating columns,
#    and substitute the 2-character USPS abbreviations for state names.  Does not handle
#    abbreviations of 3 or more characters yet.  The columns are assumed to be numbered
#    starting with 1, not 0, for designating which column to translate!
#
# Usage: statestocodes.pl  <FILEIN>  <FILEOUT>  <column-to-translate>
#
# Example:  statestocodes.pl pr.file.csv  pr.statecoded.txt 2
#
# David Harris, Version of 1 Jan 2006 21:30 GMT

# ------------------------------------------------
# The table of states and abbreviations:
%statecodes = (
"ALABAMA" => "AL",
"ALASKA" => "AK",
"AMERICAN SAMOA" => "AS",
"ARIZONA" => "AZ",
"ARKANSAS" => "AR",
"CALIFORNIA" => "CA",
"COLORADO" => "CO",
"CONNECTICUT" => "CT",
"DISTRICT OF COLUMBIA" => "DC",
"D.C." => "DC",
"FEDERATED STATES OF MICRONESIA" => "FM",
"FLORIDA" => "FL",
"GEORGIA" => "GA",
"GUAM" => "GU",
"HAWAII" => "HI",
"IDAHO" => "ID",
"ILLINOIS" => "IL",
"INDIANA" => "IN",
"IOWA" => "IA",
"KANSAS" => "KS",
"KENTUCKY" => "KY",
"LOUISIANA" => "LA",
"MAINE" => "ME",
"MARSHALL ISLANDS" => "MH",
"MARYLAND" => "MD",
"MASSACHUSETTS" => "MA",
"MICHIGAN" => "MI",
"MINNESOTA" => "MN",
"MISSISSIPPI" => "MS",
"MISSOURI" => "MO",
"MONTANA" => "MT",
"NEBRASKA" => "NE",
"NEVADA" => "NV",
"NEW HAMPSHIRE" => "NH",
"NEW JERSEY" => "NJ",
"NEW MEXICO" => "NM",
"NEW YORK" => "NY",
"NORTH CAROLINA" => "NC",
"NORTH DAKOTA" => "ND",
"NORTHERN MARIANA ISLANDS" => "MP",
"OHIO" => "OH",
"OKLAHOMA" => "OK",
"OREGON" => "OR",
"PALAU" => "PW",
"PENNSYLVANIA" => "PA",
"PUERTO RICO" => "PR",
"RHODE ISLAND" => "RI",
"SOUTH CAROLINA" => "SC",
"SOUTH DAKOTA" => "SD",
"TENNESSEE" => "TN",
"TEXAS" => "TX",
"UTAH" => "UT",
"VERMONT" => "VT",
"VIRGIN ISLANDS" => "VI",
"VIRGINIA" => "VA",
"WASHINGTON" => "WA",
"WEST VIRGINIA" => "WV",
"WISCONSIN" => "WI",
"WYOMING" => "WY"
);
# Check the number of arguments on the command line:
if ($#ARGV+1 != 3) {
  die "statestocodes.pl needs <FILEIN> <FILEOUT> <target-column-number> as arguments";
};
# Check existence of the textfile and email listing file:
(open FILEIN, "<$ARGV[0]") or die "statestocodes.pl FILEIN file $ARGV[0] not found.\n";
(open FILEOUT, ">$ARGV[1]") or die "statestocodes.pl SQL file could not made.\n";
$targetcolumn = $ARGV[2];
chomp $targetcolumn;
#Read through FILEIN file:
while ($lineread = <FILEIN>) {
  chomp $lineread;
  @allcolumns = split(/\|/, $lineread);
  # Use hash to quickly find substitution and perform it:

  $codelookup = $statecodes{ uc $allcolumns[$targetcolumn-1] } ;
  if (length $codelookup ne 2){
   $codelookup = $allcolumns[$targetcolumn-1] ;
  }
  $allcolumns[$targetcolumn-1] = $codelookup;
  # print FILEOUT "INSERT INTO `$targetcolumn` (`email`) VALUES (\'$lineread\');\n" ;
  for ($i=0; $i < (scalar @allcolumns)-1
       ; $i++) {
    print FILEOUT "$allcolumns[$i]\|" ;
  } ;
  print FILEOUT "$allcolumns[scalar @allcolumns]" ;  # Last column is not followed by '|'
  print FILEOUT "\n" ;
}
close FILEOUT;
close FILEIN;
exit;
