#!/usr/bin/perl
#-----------------------------------------------------------------------------
# Guess the field separator of a delimited text file.
#
# Reads records from the files named on the command line (or STDIN), and for
# each candidate delimiter (comma, pipe, tab) tallies how often each field
# count occurs.  The delimiter whose most frequent field count is the largest
# is reported as the most likely separator.
#
# Output:
#   STDOUT - the name of the winning delimiter ("comma", "pipe" or "tab").
#   STDERR - human-readable summary, warnings and diagnostic dumps.
# Exit status is 1 when no delimiter splits any record into 2+ fields.
#-----------------------------------------------------------------------------
use strict;
use warnings;

use Data::Dumper;
$Data::Dumper::Sortkeys = 1;

# Candidate delimiters: name => regex source used to split each record.
my %delims = (
    'comma' => ',',
    'pipe'  => '\|',
    'tab'   => '\t'
);

# Per delimiter: the most frequently seen field count (0 if none).
my %delim_results;

# Levels: 0-1
my $debug = 0;

# $count_results{delimiter name}{field count} = number of records seen.
my %count_results;

#---------------------------------------------------------------------
# Find key of largest value in a hash.
# Credit: https://stackoverflow.com/questions/2886872
#
# Arguments:
#   %hash       - hash with numeric values (passed by reference through the
#                 prototype, so callers write key_of_largest_value(%h)).
#   $query_type - optional; 'key' (default) returns the winning key,
#                 'value' returns the largest value itself.
# Returns undef for an empty hash.
#
# Ties are resolved deterministically: keys are visited in sorted order and
# the last key holding the largest value wins, so the result no longer
# depends on Perl's randomized hash ordering.
sub key_of_largest_value (\%;$) {
    my $hash       = shift;
    my $query_type = shift || 'key';

    my ($best_key, $best_value);
    for my $candidate (sort keys %{$hash}) {
        my $value = $hash->{$candidate};

        # ">=" (not ">") is what makes the last tied key win.
        if (!defined $best_key || $value >= $best_value) {
            $best_key   = $candidate;
            $best_value = $value;
        }
    }

    return $best_value if $query_type eq 'value';
    return $best_key;
}

#-----------------------------------------------------------------------------
# Compile each delimiter pattern once instead of once per record.
my %pattern_for = map { $_ => qr/$delims{$_}/ } keys %delims;

my $record_count = 0;

# Count number of fields each delimiter yields for each line.
while (my $line = <>) {
    $record_count++;

    # Remove the line terminator so that it cannot masquerade as a field,
    # and so the final record is treated the same with or without it.
    chomp $line;

    foreach my $delim (sort keys %delims) {

        # Limit -1 keeps trailing empty fields ("a,b," is three fields).
        my @fields = split $pattern_for{$delim}, $line, -1;

        # A record that does not contain the delimiter counts as 0, not 1.
        my $delim_count = @fields > 1 ? scalar @fields : 0;

        # Debug output goes to STDERR: STDOUT carries only the result.
        ($debug > 0)
            && print STDERR "line: $record_count; delim: $delim - $delim_count\n";

        # Accumulate field-count *frequency*.
        if ($delim_count > 0) {
            $count_results{$delim}{$delim_count}++;
        }
    }
}

# For each delimiter, remember its most frequent field count.
foreach my $delim (sort keys %delims) {
    $delim_results{$delim} =
        key_of_largest_value(%{ $count_results{$delim} }, 'key') || 0;
}

# Pick the delimiter whose most frequent field count is the largest.
# TODO - for large files, get rough consensus.
#        (assume slight variation or overlap in delimiters)
my $best_delim = key_of_largest_value(%delim_results, 'key') || '';
my $best_delim_count =
    key_of_largest_value(%{ $count_results{$best_delim} }, 'key') || 0;
my $best_delim_freq = $count_results{$best_delim}{$best_delim_count} || 0;

($debug > 0)
    && print STDERR "best_delim: $best_delim; best_delim_count: $best_delim_count; best_delim_freq: $best_delim_freq\n";

# Numeric comparison: $best_delim_count is a count, not a string.
if ($best_delim_count > 0) {
    print "$best_delim\n";
    print STDERR "- Most likely separator: $best_delim\n";
    print STDERR "- Most likely intended field count: $best_delim_count\n";

    # NOTE: the count reported in the two messages below is the field count.
    if ($record_count != $best_delim_freq) {
        print STDERR "- WARNING: only $best_delim_freq/$record_count have $best_delim_count ${best_delim}s.\n";

        # Pass a reference so the dump is one structure with sorted keys.
        print STDERR Dumper \%count_results;
    }
    else {
        print STDERR "- All $record_count records have $best_delim_count ${best_delim}s.\n";
    }
}
else {
    print STDERR "- No obvious separator found\n";
    print STDERR Dumper \%count_results;
    exit 1;
}

($debug > 0) && print STDERR Dumper \%count_results;
#-----------------------------------------------------------------------------