#!/usr/bin/perl -w
# /=====================================================================\ #
# |  latexmlfind                                                        | #
# | xml search utility program                                          | #
# |=====================================================================| #
# | Part of LaTeXML:                                                    | #
# |  Public domain software, produced as part of work done by the       | #
# |  United States Government & not subject to copyright in the US.     | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov>                        #_#     | #
# | http://dlmf.nist.gov/LaTeXML/                              (o o)    | #
# \=========================================================ooo==U==ooo=/ #
use strict;
use warnings;
use FindBin;
use lib "$FindBin::RealBin/../lib";
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
use LaTeXML::Common::XML;
use Text::Wrap;
use LaTeXML;    # Currently, just for version information.

#**********************************************************************
# Parse command line
my ($verbosity) = (0);
my ($help, $showversion, $SKELETON) = (0, 0, 0);
my (@symbols, @unknowns, @posfuncs, @labels, @refnums);
GetOptions("symbol=s" => \@symbols,,
  "unknown=s"          => \@unknowns,
  "possiblefunction=s" => \@posfuncs,
  "label=s"            => \@labels,
  "refnum=s"           => \@refnums,
  "skeleton"           => \$SKELETON,
  "quiet"              => sub { $verbosity--; },
  "verbose"            => sub { $verbosity++; },
  "VERSION"            => \$showversion,
  "help"               => \$help,
  ) or pod2usage(-message => $LaTeXML::IDENTITY,
  -exitval => 1, -verbose => 0, -output => \*STDERR);
pod2usage(-message => $LaTeXML::IDENTITY, -exitval => 1, -verbose => 2, -output => \*STDOUT)
  if $help;
if ($showversion) { print STDERR "$LaTeXML::IDENTITY\n"; exit(1); }
pod2usage(-message => "$LaTeXML::IDENTITY\nMissing input TeX file",
  -exitval => 1, -verbose => 0, -output => \*STDERR) unless @ARGV;
my $source = $ARGV[0];

#**********************************************************************
# Do the processing.
print STDERR "$LaTeXML::IDENTITY\n" if $verbosity > -1;
binmode(STDOUT, ":encoding(UTF-8)");    # Make sure output can handle UTF8

my $DOC = LaTeXML::Common::XML::Parser->new()->parseFile($source);
my $XPATH = LaTeXML::Common::XML::XPath->new(ltx => "http://dlmf.nist.gov/LaTeXML");
$XPATH->registerFunction('match-font', \&LaTeXML::Common::Font::match_font);

# Objects being labelled sections, equations, etc.
our %OBJECTS = ();
our $ROOT_OBJECT = { type => 'Document',
  subobjects => [], items => [] };

foreach my $symbol (@symbols) {
  collect_matches("Symbols \"$symbol\"",
    "//ltx:Math[descendant::ltx:XMTok[\@name='$symbol' or text()='$symbol']"
      #                        BUT which isn't in presentation branch of an XMDual!!
      #                        ie. no ancestor w/preceding sibling has parent = XMDual
      . "[not(ancestor-or-self::*[preceding-sibling::*][parent::ltx:XMDual])]"

      . "]"); }
foreach my $spec (@unknowns) {
  my $symbol = $spec;
  my $font;
  $font = $1 if $symbol =~ s/\{([\w\s]*)\}$//;
  collect_matches("Unknown \"$spec\"",
    # Find Math containing an XMTok, with role=UNKNOWN
    "//ltx:Math[descendant::ltx:XMTok[\@role='UNKNOWN']"
      #                         whose name or content is the requested symbol
      . "[\@name='$symbol' or text()='$symbol']"
      . ($font ? "[\@font='$font']" : '')
      #                         BUT which isn't in presentation branch of an XMDual!!
      #                        ie. no ancestor w/preceding sibling has parent = XMDual
      . "[not(ancestor-or-self::*[preceding-sibling::*][parent::ltx:XMDual])]"
      . "]"); }

foreach my $symbol (@posfuncs) {
  collect_matches("Possible function \"$symbol\"",
    "//ltx:Math[descendant::ltx:XMTok[\@possibleFunction='yes']"
      . "[\@name='$symbol' or text()='$symbol']"
      . "]"); }

show_matches($ROOT_OBJECT);

#**********************************************************************
# This matches fonts when both are converted to strings (toString),
# such as when they are set as attributes.
sub match_font {
  my ($font1, $font2) = @_;
  #print STDERR "Match font \"".($font1 || 'none')."\" to \"".($font2||'none')."\"\n";
  return 0 unless $font1 && $font2;
  my @comp1;
  my @comp2;
  if ($font1 =~ /^Font\[(.*)\]$/) {
    @comp1 = split(',', $1); }
  if ($font2 =~ /^Font\[(.*)\]$/) {
    @comp2 = split(',', $1); }
  while (@comp1) {
    my $c1 = shift @comp1;
    my $c2 = shift @comp2;
    return 0 if ($c1 ne '*') && ($c2 ne '*') && ($c1 ne $c2); }
  return 1; }
#**********************************************************************

sub collect_matches {
  my ($description, $xpath) = @_;
  my @nodes = $XPATH->findnodes($xpath, $DOC);
  print "Query $description appears in " . scalar(@nodes) . " places\n";
  print "  [XPath = \"$xpath\"]\n" if $verbosity > 0;
  foreach my $node (@nodes) {
    my $object = id_object($node);
    push(@{ $$object{items} }, $node); }
  return; }

sub id_object {
  my ($node) = @_;
  my $id;
  while (1) {
    $node = $node->parentNode;
    return $ROOT_OBJECT if $node->nodeType != XML_ELEMENT_NODE;
    last if $id = $node->getAttribute('xml:id'); }
  if (my $object = $OBJECTS{$id}) {
    return $object; }
  else {
    my $parent_object = id_object($node);
    my $type          = $node->localname;
    my $labels        = $node->getAttribute('labels');
    my $refnum        = $node->getAttribute('refnum');
    my ($title) = $XPATH->findnodes("child::ltx:toctitle | child::ltx:title", $node);
    my $desc = ($refnum ? ($title ? "$refnum. " . $title->textContent : $refnum)
      : ($title ? $title->textContent : ''));
    $desc =~ s/\s+/ /g;
    $OBJECTS{$id} = $object
      = { id => $id, labels => $labels, type => $type, description => $desc, children => [], items => [] };
    push(@{ $$parent_object{children} }, $object);
    return $object; } }

sub show_matches {
  my ($object, $indent) = @_;
  $indent = '' unless defined $indent;
  print $indent. "$$object{type}:"
    . ($$object{id}          ? " ID=$$object{id}"           : '')
    . ($$object{labels}      ? " Labels=$$object{labels}"   : '')
    . ($$object{description} ? " \"$$object{description}\"" : "") . "\n";
  if (!$SKELETON) {
    foreach my $item (@{ $$object{items} }) {
      show_node($item, $indent . ' | '); } }
  foreach my $child (@{ $$object{children} }) {
    show_matches($child, $indent . ' | '); }
  return; }

sub show_node {
  my ($node, $indent) = @_;
  if ($node->localname eq 'Math') {
    my $ptex = $node->getAttribute('tex');
    my $ctex = $node->getAttribute('content-tex');
    if ($verbosity > 1) {
      print $indent. $node->toString . "\n"; }
    else {
      print wrap($indent, $indent, $ctex || $ptex) . "\n"; } }
  else {
    print $indent. $node->toString . "\n"; }
  return; }

#**********************************************************************
__END__

=head1 NAME

C<latexmlfind> - finds interesting things in LaTeXML generated XML.

=head1 SYNOPSIS

latexmlfind [options] xmlfile

  Options:
   --symbol=symbol            finds equations where the symbol appears.
   --unknown=symbol           finds equations where the unknown symbol appears (ie role=UNKNOWN).
   --possiblefunction=symbol  finds equations where symbol is possibly used as a function.
   --label=symbol             finds objects with the given label.
   --refnum=symbol            finds objects with the given refnum (reference number).
   --quiet                    suppress messages (can repeat)
   --verbose                  more informative output (can repeat)
   --VERSION                  show version number.
   --help                     shows help message.

=head1 OPTIONS AND ARGUMENTS

latexmlfind is useful for finding objects within an XML file generated by LaTeXML.


=over 4

=item B<--output=>I<outputfile>

Specifies the output file; by default the XML is written to stdout.

=item B<--unknown=>I<symbol>

Finds equations where the unknown symbol appears.

=item B<--possiblefunction=>I<symbol>

Finds equations where symbol is possibly used as a function.

=item B<--label=>I<label>

Finds objects (sections, equations, whatever) labeled by the given label.

=item B<--refnum=>I<refnum>

Finds objects (sections, equations, whatever) with the given reference number.

=item B<--quiet>

Reduces the verbosity of output during processing, used twice is pretty silent.

=item B<--verbose>

Increases the verbosity of output during processing, used twice is pretty chatty.
Can be useful for getting more details when errors occur.

=item B<--VERSION>

Shows the version number of latexmlfind..

=item B<--help>

Shows this help message.

=back

=head1 SEE ALSO

L<latexml>, L<LaTeXML>

=cut
