#!/usr/bin/perl -Tw
#-----------------------------------------------------------------------------
my $self_name = 'guess-recordcount.pl';
#
# Purpose: Assuming uniform record length, guess a file's record count
# Author:  Royce Williams / @TychoTithonus
# Created: 2020-11-03
# Version: $Id$
#-----------------------------------------------------------------------------
# Note that if the file is of mixed record type, this will be notably wrong.
#
# Method: read up to N records from the start of the file, measure their
# lengths (line terminator included), and divide the file size by the mean
# sampled record length.  The estimate is printed on STDOUT; all diagnostics
# go to STDERR so that STDOUT only ever carries the count.
#
# Exit status: 0 on success, 1 if the file is unreadable, 2 on usage errors.
#-----------------------------------------------------------------------------

use strict;
use warnings;

use POSIX;
use Data::Dumper;
use Scalar::Util qw(looks_like_number);

$Data::Dumper::Sortkeys = 1;

# Levels: 0-1
my $debug = 0;
#my $debug = 1;

# Number of records sampled when no count is given on the command line.
my $default_max_records = 1000;

#---------------------------------------------------------------------
# Print usage help on STDOUT.
sub show_usage {
    print "\n$self_name - guess record count from sample and file size\n\n";
    print "Usage: $self_name [file] [optional record count]\n";
    print "\n";
    print "Current default record count: $default_max_records\n";
    print "If the file is less than the sample size, exact count is used.\n";
    print "\n";
    return;
}

#---------------------------------------------------------------------
# Print the given message parts on STDERR, but only when debugging is on.
sub debug_log {
    my @message_parts = @_;
    print STDERR @message_parts if $debug > 0;
    return;
}

#---------------------------------------------------------------------
# Find key of largest value in a hash.
# Credit: https://stackoverflow.com/questions/2886872
#
# Arguments:
#   %hash       - hash with numeric values.  The (\%$) prototype passes it by
#                 reference; it is kept so existing call sites of the form
#                 key_of_largest_value(%hash, 'key') continue to work.
#   $query_type - 'key' (default) to get the winning key, 'value' to get the
#                 largest value itself.  Any other string behaves like 'key'.
#
# Returns undef for an empty hash.
#
# Tie-breaker: keys are visited in sorted order (numerically when every key
# looks like a number, otherwise as strings) and the last key holding the
# largest value wins, so the result no longer depends on hash ordering.
sub key_of_largest_value (\%$) {
    my $hash       = shift;
    my $query_type = shift || 'key';

    my @keys = keys %{$hash};
    return if !@keys;

    my $non_numeric_keys = grep { !looks_like_number($_) } @keys;
    my @ordered_keys;
    if ($non_numeric_keys) {
        @ordered_keys = sort { $a cmp $b } @keys;
    }
    else {
        @ordered_keys = sort { $a <=> $b } @keys;
    }

    # Start with the first key and value as candidate biggest.
    my $key = shift @ordered_keys;
    my $big = $hash->{$key};

    for my $candidate (@ordered_keys) {
        # '>=' rather than '>' implements "last one wins" on ties.
        if ($hash->{$candidate} >= $big) {
            $big = $hash->{$candidate};
            $key = $candidate;
        }
    }

    return $big if $query_type eq 'value';
    return $key;
}

#-----------------------------------------------------------------------------
# Main program.  Takes the command-line arguments and returns the exit status.
sub main {
    my @args = @_;

    #-------------------------------------------------------------------------
    # Process cmdline.
    if (!@args) {
        show_usage();
        return 2;
    }

    my $file        = shift @args;
    my $max_records = shift(@args) || $default_max_records;

    # A non-numeric or non-positive count would make the sampling loop read
    # the whole file and then divide by zero, so reject it up front.
    if ($max_records !~ m{\A \d+ \z}xms || $max_records < 1) {
        print STDERR "- Error: record count must be a positive integer\n";
        return 2;
    }
    $max_records += 0;

    if (!-r $file) {
        print STDERR "- Error: file isn't readable\n";
        return 1;
    }

    if (-z $file) {
        print STDERR "- Warning: file is empty\n";
        print "0\n";
        return 0;
    }

    #-------------------------------------------------------------------------
    # Get file size and open it.
    my $file_size = -s $file;
    debug_log('- File ' . $file . ' - size ' . $file_size . "\n");

    # Three-argument open: the filename can never be parsed as a mode or a
    # pipe.  ':raw' keeps length() counting bytes, consistent with -s.
    open(my $fh, '<:raw', $file) or die "Can't open $file: $!";

    #-------------------------------------------------------------------------
    # Get length of first $max_records records.
    my %length_results;    # record length => number of sampled records
    my $record_count = 0;
    my $total_bytes  = 0;

    while (my $record = <$fh>) {
        $record_count++;
        my $record_length = length $record;
        debug_log('- Record: ' . $record_count . ' - length ' . $record_length . "\n");

        # Accumulate record_length frequency.
        $length_results{$record_length}++;

        # Accumulate total bytes.
        $total_bytes += $record_length;

        # Only read the first N records.
        last if $record_count >= $max_records;
    }

    close($fh) or warn "- Warning: error closing $file: $!\n";

    if ($record_count < $max_records) {
        print STDERR "- Actual record count $record_count of $file is under sample size $max_records - using exact count\n";
        print "$record_count\n";
        return 0;
    }

    #-------------------------------------------------------------------------
    # Get consensus - the most frequent record length in the sample.
    my $best_record_length      = key_of_largest_value(%length_results, 'key') || 0;
    my $best_record_length_freq = $length_results{$best_record_length} || 0;
    debug_log("best_record_length: $best_record_length; best_record_length_freq: $best_record_length_freq\n");

    if ($best_record_length > 0) {
        # Change this to be a percentage check - say, 90%? Configurable?
        if ($record_count != $best_record_length_freq) {
            print STDERR "- WARNING: only $best_record_length_freq/$record_count records have $best_record_length bytes.\n";
            debug_log(Dumper(\%length_results));
        }
        else {
            debug_log("- All $record_count records have length ${best_record_length}s.\n");
        }
    }

    #-------------------------------------------------------------------------
    # Estimate record count from file size.
    # Sampled lengths already include the line terminator; the .5 makes int()
    # round to the nearest whole record.
    my $mode_based_count =
        $best_record_length > 0
        ? int(($file_size / $best_record_length) + 0.5)
        : 0;

    # Mean record length over the records actually sampled.
    my $mean_record_length = $total_bytes / $record_count;
    my $mean_based_count   = int(($file_size / $mean_record_length) + 0.5);

    debug_log("- Estimates based on - mean: $mean_based_count; mode: $mode_based_count\n");

    # The mean-based estimate is the one reported.
    print "$mean_based_count\n";

    return 0;
}

#-----------------------------------------------------------------------------
# Run only when executed as a script, so the subs above can be loaded and
# exercised on their own.
exit main(@ARGV) unless caller;