#!/usr/bin/perl -Tw
#-----------------------------------------------------------------------------
my $self_name = 'guess-recordcount.pl';
#
# Purpose: Assuming uniform record length, guess a file's record count
# Author:  Royce Williams / @TychoTithonus
# Created: 2020-11-03
# Version: $Id$
#-----------------------------------------------------------------------------
# Note that if the file is of mixed record type, this will be notably wrong.
#-----------------------------------------------------------------------------

use strict;
use warnings;

use POSIX;

use Data::Dumper;
$Data::Dumper::Sortkeys = 1;

# Frequency table filled while sampling: record length (as reported by
# length()) => number of sampled records having that length.
my %length_results;

# Debug verbosity. Any value above 0 makes the script print diagnostics
# to STDERR.
# Levels: 0-1
my $debug = 0;
#my $debug = 1;

# Number of records to sample when no record count is given on the
# command line.
my $default_max_records = 1000;

#---------------------------------------------------------------------

sub show_usage {

    # Print the help text to STDOUT. Takes no arguments and returns nothing.
    # An empty entry in the list below produces a blank line in the output.
    my @help_lines = (
        '',
        "$self_name - guess record count from sample and file size",
        '',
        "Usage: $self_name [file] [optional record count]",
        '',
        "Current default record count: $default_max_records",
        'If the file is less than the sample size, exact count is used.',
        '',
    );

    # Assemble the complete text once and emit it with a single print.
    my $help_text = join("\n", @help_lines) . "\n";
    print $help_text;

    return;

}

#---------------------------------------------------------------------
# Find the key (or the value) belonging to the largest value in a hash.
# Credit: https://stackoverflow.com/questions/2886872
#
# Arguments:
#   %hash       - hash whose values are compared numerically. Because of the
#                 (\%$) prototype, callers pass the hash itself and the sub
#                 receives a reference to it.
#   $query_type - 'key' (default) returns the winning key, 'value' returns
#                 the largest value. Any other string behaves like 'key'.
#
# Returns undef when the hash is empty.
#
# Ties are broken deterministically: among the keys that share the largest
# value, the key that sorts last wins. Keys that look like numbers are
# ordered numerically and placed before all other keys, which are ordered
# as strings. The result therefore never depends on Perl's randomized hash
# ordering.
sub key_of_largest_value (\%$) {

    my $hash = shift;
    my $query_type = shift || 'key';

    # Core module; loaded here so the sub carries its own dependency.
    require Scalar::Util;

    # Establish one fixed, total ordering of the keys.
    my @ordered_keys = sort {
        my $a_is_num = Scalar::Util::looks_like_number($a) ? 1 : 0;
        my $b_is_num = Scalar::Util::looks_like_number($b) ? 1 : 0;

          ($a_is_num && $b_is_num)  ? ($a <=> $b || $a cmp $b)
        : ($a_is_num != $b_is_num)  ? ($b_is_num <=> $a_is_num)
        :                             ($a cmp $b);
    } keys %{$hash};

    my ($key, $big);

    for my $candidate (@ordered_keys) {
        my $value = $hash->{$candidate};

        # ">=" (not ">") lets a later key replace an earlier one on a tie,
        # which is what makes "last one wins" hold.
        if (!defined($key) || $value >= $big) {
            $key = $candidate;
            $big = $value;
        }
    }

    if ($query_type eq 'value') {
        return($big);
    }

    # 'key' and any unrecognized query type both yield the key.
    return($key);

}

#-----------------------------------------------------------------------------
# Process cmdline.

# Without any argument there is nothing to do: show the help text and exit
# with status 2.
if (! @ARGV) {
    show_usage();
    exit 2;
}

my $file = shift(@ARGV);

# A missing, empty or "0" record count falls back to the default sample size.
my $max_records = shift(@ARGV) || $default_max_records;

# The sample size controls where sampling stops and is used in the estimate,
# so it has to be a positive whole number. Reject anything else up front
# rather than producing a bogus estimate or a division by zero later.
if ($max_records !~ /\A[0-9]+\z/ || $max_records == 0) {
    print STDERR "- Error: record count must be a positive integer\n";
    exit 2;
}

# Diagnostics go to STDERR so that STDOUT only ever carries the record count.
if (! -r $file) {
    print STDERR "- Error: file isn't readable\n";
    exit 1;
}

# An empty file trivially has zero records.
if (-z $file) {
    print STDERR "- Warning: file is empty\n";
    print "0\n";
    exit 0;
}

#-----------------------------------------------------------------------------
# Get file size and open it.

my $file_size = -s $file;
($debug > 0) && print STDERR ('- File ' . $file . ' - size ' . $file_size . "\n");

# Three-argument open with a lexical handle: the file name is taken
# literally and is never parsed for mode characters or pipes. The :raw layer
# keeps record lengths in bytes, the same unit -s reports.
open(my $fh, '<:raw', $file) or die "Can't open $file: $!";

#-----------------------------------------------------------------------------

# Get length of first $max_records records
my $record_count = 0;
my $total_bytes = 0;

while (my $record = <$fh>) {

    $record_count++;

    # The length includes the record's line terminator, if it has one.
    my $record_length = length($record);

    ($debug > 0) && print STDERR ('- Record: ' . $record_count . ' - length ' . $record_length . "\n");

    # Accumulate record_length frequency.
    $length_results{$record_length}++;

    # Accumulate total bytes.
    $total_bytes += $record_length;

    # Only read the first N records.
    last if $record_count >= $max_records;

}

close($fh);

# The whole file was read before the sample filled up, so the count is exact
# and no estimate is needed.
if ($record_count < $max_records) {
    print STDERR "- Actual record count $record_count of $file is under sample size $max_records - using exact count\n";
    print "$record_count\n";
    exit 0;
}

#-----------------------------------------------------------------------------
# Find the most common record length in the sample (the mode) and warn when
# the sampled records do not all share that length.
my $best_record_length = key_of_largest_value(%length_results, 'key') || 0;

my $best_record_length_freq  = $length_results{$best_record_length} || 0;

($debug > 0) && print STDERR "best_record_length: $best_record_length; best_record_length_freq: $best_record_length_freq\n";

if ($best_record_length > 0) {

    # TODO: change this to be a percentage check - say, 90%? Configurable?
    if ($record_count != $best_record_length_freq) {
        print STDERR "- WARNING: only $best_record_length_freq/$record_count records have $best_record_length bytes.\n";
        # Pass a reference so the hash is dumped as one structure.
        ($debug > 0) && print STDERR Dumper(\%length_results);
    } else {
        ($debug > 0) && print STDERR "- All $record_count records have length ${best_record_length}s.\n";
    }

}

# Without at least one sampled record and one sampled byte there is nothing
# to base an estimate on, and the divisions below would fail.
if ($record_count < 1 || $total_bytes < 1) {
    print STDERR "- Error: no records could be sampled\n";
    exit 1;
}

# Estimate record count from file size.
# Sampled lengths already include the line terminator, so nothing is added
# for it; adding .5 before int() rounds to the nearest whole number.

# Estimate based on the most common record length (mode).
my $mode_based_count = ($best_record_length > 0)
    ? int( ($file_size / $best_record_length) + 0.5 )
    : 0;

# Estimate based on the mean record length of the records actually sampled.
my $mean_record_length = $total_bytes / $record_count;
my $mean_based_count = int( ($file_size / $mean_record_length) + 0.5 );

($debug > 0) && print STDERR "- Estimates based on - mean: $mean_based_count; mode: $mode_based_count\n";

# The mean-based estimate is the script's result; the mode-based one is only
# shown in debug output.
print "$mean_based_count\n";
#print "$mode_based_count\n";

#-----------------------------------------------------------------------------

#($debug > 0) && print STDERR Dumper %length_results;

#-----------------------------------------------------------------------------