Perl Program Code for freq2

This file contains the actual Perl code of freq2.pl, including all the header comments. freq2 is a program for computing frequency counts from fixed column raw data. It is handy for doing quick checks on text data files.


#!/usr/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#)  freq2.pl 1.2 99/02/07 @(#)
##  Author:
##      Kent Nassen     knassen@umich.edu
##  Description:
##      A program for grabbing quick and dirty univariate frequencies
##      from an LRECL data file (vars in fixed column locations). Useful
##      if you just want to see what's in a column range without having
##      to fire up SAS or SPSS.
##
##  Usage:
##      freq.pl [-h] [-c[#-#][#] filename]
##   where: -c#-# indicates starting & ending column numbers of the variable
##          and -c# indicates a single-column variable at column #.
##          -h gives a help screen.
##          Only one filename may be given.
##
##  Limitations:
##        - Can only print the first 20 characters of a value,
##          although can compute frequencies for longer values.
##        - The order of values in the output is set by a character
##          sort rather than a numeric sort (the program knows
##          nothing of variable types like numeric or string).
##        - If you give a column range outside the logical record
##          length of the data, you will still get output, but all
##          values will be null.
##
##  v1.1: Added computation of percentages (7/4/1997)
##  v1.2: Checked with -w and 'use strict'.  Improved error handling.
##        Fixed number of lines per page in output. (2/7/1999)
##      
##---------------------------------------------------------------------------##

use FileHandle;
use Getopt::Std;

use vars qw($opt_h $ProgName $opt_c $temp $loc $fname 
            $nofcases $startcol $endcol $cfreq $value 
            $cfreqpct $valuepct $freqpct @data %freq
            $transpos $version $pagesize);

$pagesize=57;
STDOUT->format_lines_per_page($pagesize);

# Check performance
#my $begint = (times) [0];

$version="(v1.2, 2/07/99)";

#$opt_h="";

($ProgName = $0) =~ s%.*/%%;  # Unix
#($ProgName = lc $0) =~ s%.*\\%%;  # DOS

# Usage message subroutine
sub DisplayUsage {
    print STDERR "\n$ProgName: compute frequencies for a single variable in an\n",
      "    LRECL data file. Also computes cumulative frequencies\,\n",
      "    percentages\, and cumulative percentages.\n",
      "    --Kent Nassen $version\n",
      "\nUsage: $ProgName [-h] [-c[#-#][#] filename]\n",
	  "       -c#-#      Start-End column numbers of variable\n",
	  "       -c#        Single column variable at column #\n",
  	  "       filename   LRECL data file filename\n",
	  "       -h         Show this help screen\n",
	  "\n",
	  "       Example: $ProgName -c5-7 fylename.dat\n",
	  "\n       (Column numbers in the file start at column 1)\n\n";
      exit 1;
}

# Get possible options (-c requires a parameter)
getopts (('hc:') or &DisplayUsage); 

# -h option?
if ($opt_h) { &DisplayUsage };

# Parse the -c option for starting and ending column locations
#  (ending location need not be present if the variable is only
#  one column wide, e.g. -c5)
if (!$opt_c) { print STDERR "\n   *** I need column locations!\n"; 
			&DisplayUsage;
			} # if we don't have an opt_c, no need to go further
$_ = $opt_c;
if (/^([0-9]+)-([0-9]+$)/) { # both starting and ending locations: #-#
	$startcol=$1; $endcol=$2;
	}
elsif (/^[0-9]+$/) { # just starting location: #
	$startcol=$opt_c; $endcol=$startcol;
	}
else { print STDERR "\n   *** I couldn't understand your column location(s)!\n"; 
	&DisplayUsage;
	}

# See if the column locations of the variable make sense
if ($endcol==0 || $startcol==0) {
	print STDERR "\n   *** Starting or ending column was zero.\n";
	&DisplayUsage;
	}

# Assume startcol > endcol is a transposition of column locations
# You may want to make this an error instead.
if ($startcol > $endcol) { # swap startcol and endcol
	$temp=$endcol; $endcol=$startcol; $startcol=$temp; 
    print STDERR "\n   *** Start col greater than end col: assuming reversed col. locations.\n";
    }

# Set up a string var for printing out hyphenated column locations 
# (rather than construct this for each page printed)
$loc="$startcol"."-"."$endcol";

# What is left on the command is the filename to be processed (hopefully)
if ($#ARGV>0) { print STDERR "\n   *** One filename at a time, please!\n";
		&DisplayUsage;
		}
if ($#ARGV<0) { print STDERR "\n   *** I need a data filename!\n";
		&DisplayUsage;
		}
open INFILE, "<$ARGV[0]" or die "\n$ProgName:   *** Can't open data file '$ARGV[0]': $!\n\n";

# Set up a string var for printing out a quoted filename 
#  (rather than construct this for each page printed)
$fname="\"".$ARGV[0]."\"";

#### ---------------------------------------------------- ####
#### Read the data from the file named on the commandline ####
#### ---------------------------------------------------- ####
while (<ARGV>) { chop;   
	# Pull the desired columns and save in data array
	push @data, substr($_,$startcol-1,$endcol-$startcol+1); 
	}

#### ---------------------------------------------------- ####
#### The real work begins here                            ####
#### ---------------------------------------------------- ####
# Accumulate Frequencies 
$nofcases=$cfreq=0;
foreach $value (@data) { $freq{$value}++; $nofcases++; }

# Print Out the Frequencies
foreach $value (sort { $a <=> $b } keys(%freq)) {
	$cfreq=$cfreq+$freq{$value}; # cumulative freqs
	if ($nofcases) { $cfreqpct=($cfreq/$nofcases)*100 } # cumulative percent
		else { $cfreqpct="----" } # don't divide by zero
	if ($nofcases) {$freqpct=($freq{$value}/$nofcases)*100;} # value percent
		else { $freqpct="----" }
	write; # this will be formatted by the formats at the end
	}
print "\n";
close INFILE;

# Check performance
#my $endt = (times)[0];
#printf("\n Preceding task required %.2f seconds CPU time\n",$endt-$begint);

# Formatted, paged output
format STDOUT_TOP =
 
   @||||@||
   "Page",$%
   @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<@<<<<<<<<<<<<
   "Frequencies for the values in columns",$loc
   @<<<<<<<<<<<@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
   "in the file ",$fname
 
                                           Cumulative  Cumulative
               Value   Frequency  Percent   Frequency     Percent
              -------  ---------  -------  ----------  ----------
.
format STDOUT =
@>>>>>>>>>>>>>>>>>>> @>>>>>>>>>  @###.##  @>>>>>>>>>     @###.##
$value,$freq{$value},$freqpct,$cfreq,$cfreqpct 
.


Last Modified: Sat Sep 6 14:26:32 EDT 2025

Back to the freq2 Perl Page