For checkouts or to view logs direct your SVN client to svn://svn.saintamh.org/code/alsj2srt/alsj2srt.pl

#!/usr/bin/perl -w
use warnings;
use strict;
$|++;

# file: alsj2srt.pl

# Herve Saint-Amand
# Montreal
# Sat Nov 08 17:51:40 2008

# Reads a radio transmission transcript from the Apollo Lunar Surface Journal
# or the Apollo Flight Journal and converts it to a movie subtitle file.

# Ground Elapsed Times or Mission Times are stored as number of miliseconds
# since liftoff. All measures of time here are in miliseconds.

# TODO
#
#  * save all configs to one file, so that I don't have to figure out the full
#    command-line each time
#  * regression tests, so that after a day of working on some mission I can see
#    whether (and how) I've affected other missions

#------------------------------------------------------------------------------
# includes

use English;
use HTML::Entities;
use List::Util qw/max min sum/;

#------------------------------------------------------------------------------
# constants, global vars

# max width of a line of subtitle, in chars
my $MAX_WIDTH = 40;

# we never display a card for less/more than this
my $MIN_DURATION =  750;
my $MAX_DURATION = 8000;

# estimate of speech rate, in chars per second
my $SPEECH_RATE = 12;

# page-ending punctuation
my $RE_PUNCT = qr/(?:[,\?!;]|\.(?:\.\.)?| -)/;

# we prefer breaking pages just before these words
my @STOPWORDS = qw/but with even at from by/;
my $RE_STOPWORDS = '(?:' . join ('|', @STOPWORDS) . ')';
$RE_STOPWORDS = qr!$RE_STOPWORDS!i;

# phrases used to denote pauses
my %PAUSES = %{{
    'pause'                =>  2_000,
    'long pause'           => 10_000,

    # There is no point in forcing a 30-min blank in the subtitles, so we
    # basically ignore these:
    'comm break'           => 0,
    'long comm break'      => 0, 
    'very long comm break' => 0,
}};
my $RE_PAUSE = '(?:' . join ('|', keys %PAUSES) . ')';
$RE_PAUSE = qr!$RE_PAUSE!i;

# Whether to skip comments marked as 'onboard'
my $SKIP_ONBOARD = 1;

# Whether we delete all text that's between parentheses
my $DELETE_PARENS = 0;

# Whether we ignore (pause)s at the end of statements
my $IGNORE_TRAILING_PAUSE = 0;

# Maps GET timestamps as they appear in the HTML to our corrections
my %GET_MAPPINGS;

# Allows to shift all timestamps by a certain amount
my %RECALIBRATIONS;

my $input_file_name;
my $output_file_name;
my $start_get;
my $video_length;


# This is the parsed info from the input file. Each element is a hashref
# containing GET timestamp, duration (in milis), text, post-pause (if any), etc
my @pages;

#------------------------------------------------------------------------------
# Various configs, depending on which mission it is we're dealing with

my %CONFIGS = %{{

    # Apollo 8 AFJ config
    a8 => {
        'pause' => 2_000,
        'long pause' => 10_000,
        'skip_onboard' => 1,
        'delete_parens' => 1,
    },

    # Apollo 11 ALSJ config
    a11 => {
        'pause' => 2_000,
        'long pause' => 10_000,
        #'ignore_trailing_pause' => 1,
        'delete_parens' => 1,

        recalibrations => {
            '109:19:36' => '-11',
            '109:20:40' => '-38',
            '109:20:56' => '-2',
            '109:21:18' => '-3.5',
            '109:21:39' => '-5',
            '109:24:48' => '-24',
            '109:25:08' => '+3.5',
            '109:25:30' => '+20',
            '109:36:47' => '+5',
            '109:37:00' => '+5',
            '109:37:22' => '-10',
            '109:52:27' => '-10',
            '110:50:00' => 'reset',
            '110:55:42' => '+10',
        },

        get_mappings => {
            '109:44:35' => '109:45:13',
            '109:52:24' => '109:52:09',
            '110:37:58' => '110:38:11',
            '111:25:58' => '111:26:18',
        },
    },

    a11_flight => {
        recalibrations => {
            '055:19:49' => '-64',
            '055:50:01' => '-7',
            '056:03:44' => '-4',
        },
    },

    a15 => {
        get_mappings => {
            '122:38:53' => '122:39:09',
            '122:40:28' => '122:40:21',
        },
    },

}};

sub apply_config {
    my ($selected_config_id) = @_;
    my $selected_config = $CONFIGS{$selected_config_id} ||
        die "Undefined config '$selected_config_id'";
    foreach my $key (keys %{$selected_config}) {
        my $val = $selected_config->{$key};
        if ($key =~ /^(?:long )?pause$/) {
            $PAUSES{$key} = $val;
        } elsif ($key eq 'skip_onboard') {
            $SKIP_ONBOARD = $val;
        } elsif ($key eq 'delete_parens') {
            $DELETE_PARENS = $val;
        } elsif ($key eq 'ignore_trailing_pause') {
            $IGNORE_TRAILING_PAUSE = $val;
        } elsif ($key eq 'recalibrations') {
            %RECALIBRATIONS = %$val;
        } elsif ($key eq 'get_mappings') {
            %GET_MAPPINGS = %$val;
        } else {
            die "Unknown config option '$key'";
        }
    }
}

#------------------------------------------------------------------------------
# utils

sub scan {
    my ($f) = @_;
    my $i = 0;
    while ($i < @pages) {
        my @new = $f->($pages[$i]);
        splice (@pages, $i, 1, @new);
        $i += @new;
    }
}

sub page_dur {
    my ($page) = @_;
    return 1000 * (length $page->{text}) / $SPEECH_RATE;
}

sub str2get {
    my ($str) = @_;
    $str =~ /^(\d+):(\d+):(\d+)(?::\d+)?$/ or return undef;
    my ($h, $m, $s, $ms) = ($1, $2, $3, $4);
    $ms ||= 0;
    return
        $h * 1000*60*60 +
        $m * 1000*60 +
        $s * 1000 +
        $ms;
}

sub get2str {
    my ($get) = @_;
    return sprintf '%03d:%02d:%02d,%03d',
        int ($get / (1000*60*60)),
        ($get / (1000*60)) % 60,
        ($get / 1000) % 60,
        $get % 1000;
}

#------------------------------------------------------------------------------
# parse command line

sub usage {
    my ($mesg) = @_;
    (my $me = $0) =~ s!.+/!!;

    print STDERR "$mesg\n\n" if $mesg;
    print STDERR
        "usage: $me [-c <config>] <input.html> <output.srt> <start-GET>" .
        " [video-length]\n" .
        "    start-GET : the Ground Elapsed Time (as indicated in ALSJ)\n" .
        "        at which the video starts. Note that you can use milisecs\n" .
        "    video-length: length of the output, in seconds.\n" .
        "e.g.:  $me -c a18 07day2_maroon.htm 31:10:43:100\n";
    exit 1;
}

if (@ARGV > 2 && $ARGV[0] eq '-c') {
    &apply_config ($ARGV[1]);
    splice (@ARGV, 0, 2);
}

&usage unless (@ARGV == 3 || @ARGV == 4);
$input_file_name = shift;
$output_file_name = shift;
$start_get = &str2get (shift) ||
    &usage ("Counldn't parse start-GET");
$video_length = 1000*shift if @ARGV;

# avoid commin annoying typo
if ($output_file_name !~ /\.srt$/) {
    print "Specified output file not an SRT file, is that OK? [yn] ";
    my $ans = <STDIN> || exit;
    exit unless ($ans =~ /^y/i);
}

#------------------------------------------------------------------------------
# Parse input file

# In this first pass we only parse the basic info into @pages, but @pages will
# end up with pieces of text way too long to display on one card, and other
# problems. For now we're just basically translating the HTML file into our own
# struct.

# Read file
open (FILE_IN, '<:encoding(ISO-8859-1)', $input_file_name) ||
    die "Can't read '$input_file_name': $!";
local $/;
my $html = <FILE_IN>;
close FILE_IN;

# Remove some comments. This would have been optional given how we parse below,
# but avoids some mis-indentifications
$html =~ s!<blockquote><i>.+?</i>(?:</\w+>)*</blockquote>!!sig;

# prepare for recalibrations
my $RECAL_MSECS = 0;
my @RECALS = map {
    my ($key,$raw_value) = ($_,$RECALIBRATIONS{$_});
    my $value;
    if ($raw_value eq 'reset') {
        $value = $raw_value;
    } else {
        my ($sign, $float_val) = ($raw_value =~ /^([\+\-])(\d+(?:\.\d+)?)$/);
        $value = ($sign eq '-' ? -1 : 1) * int($float_val * 1000);
    }
    {
        time => &str2get($key),
        value => $value,
    };
} sort keys %RECALIBRATIONS;

# we split lines ourselves because we have to deal with a mixture or line
# endings (AFJ is DOS, ALSJ is Mac)
while ($html =~ s/^(.*?)(?:\r\n|\r|\n)//) {
    my $line = $1;

    # 2010.01.07 - I can't think of a valid reason for actually using the line
    # breaks in the HTML as a guide for parsing, but we do. Here we need to
    # remove some spurious line breaks.
    $line .= " $1" while ($html =~ s/^\s*([^\s<>].*?)(?:\r\n|\r|\n)//);

    # de-HTML-ify the source
    $line =~ s/<[^>]+>//g;
    $line = decode_entities ($line);

    # extract info from line
    if ($line =~ m!(\d+:\d+:[\d|x]+)\s*(.+?):\s*(.+?)\s*$!) {
        my ($raw_get, $speaker, $text) = ($1, $2, $3);
        $raw_get =~ s/x/0/g;

        # parse GET, apply recalibarations. Messy code
        $raw_get = $GET_MAPPINGS{$raw_get} if $GET_MAPPINGS{$raw_get};
        my $get = &str2get ($raw_get);
        while (@RECALS && $get >= $RECALS[0]->{time}) {
            my $r = shift @RECALS;
            if ($r->{value} eq 'reset') {
                $RECAL_MSECS = 0;
            } else {
                $RECAL_MSECS += $r->{value};
            }
        }
        $get += $RECAL_MSECS;

        # check and format speaker name
        next if ($SKIP_ONBOARD && $speaker =~ /onboard/i);
        $speaker =~ s/(?<=\w)\s*\([\w\s]+\)\s*$//;

        # fix typos in the hour
        if (@pages) {
            my $prev_get = $pages[-1]->{block_get};
            foreach my $d (-2 .. 2) {
                my $fix = $d*1000*60*60;
                $get += $fix if (abs ($get - $prev_get + $fix) < 1000*60);
            }
        }

        # make sure we're within time limits
        next unless ($get >= $start_get);
        last if ($video_length && $get > ($start_get + $video_length));

        # remove some stuff we ignore
        $text =~ s{
            ([\[\(]) \s*
            (?:no\s+answer|stops\s+himself)
            \s* (?:[.;,\-]\s*)?
            ([^]]*)
            ([\]\)])
        }{$2 ? "$1$2$3" : ''}giex;

        # basic fixes to text
        $text =~ s/\s+/ /g;
        $text =~ s/\s+(?=[\.,;\!\?])//g;
        $text =~ s/(?<=\w)\.{3,}\s*(?=\w)/... /g;
        $text =~ s/^ //g;
        $text =~ s/ $//g;
        next unless ($text =~ /\S/);

        # save squeletal info
        push (@pages, {
            block     => scalar @pages,
            block_get => $get,
            speaker   => $speaker,
            text      => "$speaker: $text",
        });

    } elsif (@pages && $line =~ m!.+?\s+\(continued\)\s*:\s*(.+?)\s*$!) {
        $pages[-1]->{text} .= ' ' . $1;
    }
}

#------------------------------------------------------------------------------
# Split long pages, fix durations

# So far @pages contains 'blocks' of text, which can be several paragraphs long,
# and may also contain pause markers that influence timing and are not printed.
# We need to split these into 'pages' of subtitles, which can be displayed one
# at a time, and time these pages approproately.


# Introduce certain nonbreaking spaces before starting the splitting
$_->{text} =~ s/(Apollo)\s+(\d+)/$1~$2/gi foreach @pages;


# First, split where there are pauses
&scan (sub {
    my $p = shift;
    my @split;

    while ($p->{text} =~ s{^
        (.+?) \s*
        (?: $ |
         [\[\(] \s* ($RE_PAUSE) \s* (?:\.\s*)? [\]\)] \s*
         )
    }{}ox) {
        my ($text, $pause) = ($1, $2 ? $PAUSES{lc $2} : 0);

        # Now that we've identified pauses, we can remove the parentheses
        $text =~ s{(\s*)\((.+?)\)(\s*)($RE_PUNCT?)}{
            my ($prespace, $text, $postspace, $postpunct) = ($1, $2, $3, $4);
            $text = ($text =~ /^\s*garbled/i ? '(garbled)' : '');
            $postspace = '' if ($postpunct || ($prespace && !$text));
            $prespace  = '' if ($postpunct);
            $prespace . $text . $postspace . $postpunct;
        }ge if $DELETE_PARENS;

        # After this step, @split contains arrayrefs where the GET and duration
        # are undef. Also, the text hasn't been split to lines and may still be
        # too long to fit on one page.
        my %p = %$p;
        $p{text}       = $text;
        $p{post_pause} = $pause;
        push (@split, \%p);
    }

    # Ignore the pause at the end of a pre-split page, since the next page will
    # contain GET info anyway.
    $split[-1]{post_pause} = undef if $IGNORE_TRAILING_PAUSE;

    return @split;
});


# Repeat speaker name after long pauses
for my $i (1 .. $#pages) {
    $pages[$i]->{text} =~ s/^/$pages[$i]->{speaker}: /
        if ($pages[$i-1]->{post_pause} > 4_000 &&
            $pages[$i]->{text} !~ /^\Q$pages[$i]->{speaker}: \E/);
}


# Further split the text into lines, also splitting pages that are too long to
# fit on one page (more than 2 lines).
my $RE_LINE = qr!(?:.{1,$MAX_WIDTH})!;
&scan (sub {
    my $p = shift;
    my @split;

    while (
           # try to match an ideal-sized page ending with punctation
           $p->{text} =~ s/^($RE_LINE)\s+($RE_LINE(?<=\w)$RE_PUNCT)\s*//o ||
           # then an ideal-sized page followed by a page-starting word
           $p->{text} =~ s/^($RE_LINE)\s+($RE_LINE)\s+(?=$RE_STOPWORDS)//o ||
           # then we just fill in as many words as we can
           $p->{text} =~ s/^($RE_LINE)\s+($RE_LINE)\s+//o ||
           # and if nothing else, brute-force it
           $p->{text} =~ s/^($RE_LINE)($RE_LINE?)//o)
    {
        my @lines = ($1, $2);

        # don't break text that fits on one line
        @lines = (join (' ', @lines))
            if ($lines[1] && length (join (' ', @lines)) < $MAX_WIDTH);

        # equalize line lengths
        while ($lines[1] &&
               (length $lines[0]) > (length $lines[1]) &&
               $lines[0] !~ /^\S:\s+\S$/ &&
               $lines[0] =~ /\s+(\S+)$/ &&
               (length "$1 $lines[1]") < $MAX_WIDTH)
        {
            $lines[0] =~ s/\s+(\S+)$//;
            $lines[1] = "$1 $lines[1]";
        }

        my %p = %$p;
        $p{text} = $lines[0] . (defined $lines[1] ? "\n$lines[1]" : '');
        push (@split, \%p);
    }

    # only the last page carries the post-pause
    $split[$_]->{post_pause} = 0 foreach (0 .. $#split-1);

    return @split;
});


# Make nonbreakable spaces normal again
$_->{text} =~ s/~/ /g foreach @pages;


# For each block in the source file, compare our estimate of how long it should
# last (based on our estimate of speech rate and pause times) with how long we
# have until the next block starts. If the former is longer than the other, we
# need to 'compress' time a bit, going faster than we would if there was
# nothing after.
my %block_gets = map {$_->{block} => $_->{block_get}} @pages;
my @block_ids  = sort {$a <=> $b} keys %block_gets;
my %compression;
foreach my $i (0 .. $#block_ids-1) {
    my ($block_id, $next_block_id) = ($block_ids[$i], $block_ids[$i+1]);
    my @block_pages = grep $_->{block} eq $block_id, @pages;

    # This is how long we would normally use for this block
    my $ideal_dur = sum map {&page_dur($_) + $_->{post_pause}} @block_pages;

    # This is how long we have until the next block starts
    my $avail_dur = $block_gets{$next_block_id} - $block_gets{$block_id};

    $compression{$block_id} = ($avail_dur < $ideal_dur) ?
        $avail_dur / $ideal_dur :
        1;
}


# Assign a GET and duration to each page
foreach my $page (@pages) {
    my $b = $page->{block};
    my $c = $compression{$b} || 1;

    my $page_dur = $c * &page_dur($page);
    $page_dur = max ($MIN_DURATION, min ($MAX_DURATION, $page_dur));

    $page->{get} = $block_gets{$b};
    $page->{dur} = $page_dur;
    $block_gets{$b} += $page_dur + $c*$page->{post_pause};
}

#------------------------------------------------------------------------------
# export as SRT file

open (FILE_OUT, '>:encoding(ISO-8859-1)', $output_file_name) ||
    die "Can't write '$output_file_name': $!";

foreach my $i (0 ..$#pages) {
    my $p = $pages[$i];
    printf FILE_OUT "%d\n%s --> %s\n%s\n\n",
        $i,
        &get2str ($p->{get} - $start_get),
        &get2str ($p->{get} + $p->{dur} - $start_get),
        $p->{text};
}

close FILE_OUT;

#------------------------------------------------------------------------------