httpd/cgi-bin/check
author Yves Lafon <ylafon@w3.org>
Thu, 26 Mar 2015 22:10:34 +0100
changeset 3303 35b522f708d2
parent 3296 0907055456b2
permissions -rwxr-xr-x
protocol relative link for icons
     1 #!/usr/bin/perl -T
     2 #
     3 # W3C Markup Validation Service
     4 # A CGI script to retrieve and validate a markup file
     5 #
     6 # Copyright 1995-2013 World Wide Web Consortium, (Massachusetts
     7 # Institute of Technology, European Research Consortium for Informatics
     8 # and Mathematics, Keio University). All Rights Reserved.
     9 #
    10 # Originally written by Gerald Oskoboiny <gerald@w3.org>
    11 # for additional contributors, see
    12 # http://dvcs.w3.org/hg/markup-validator/shortlog/tip and
    13 # http://validator.w3.org/about.html#credits
    14 #
    15 # This source code is available under the license at:
    16 #     http://www.w3.org/Consortium/Legal/copyright-software
    17 
    18 #
    19 # We need Perl 5.8.0+.
    20 use 5.008;
    21 
    22 ###############################################################################
    23 #### Load modules. ############################################################
    24 ###############################################################################
    25 
    26 #
    27 # Pragmas.
    28 use strict;
    29 use warnings;
    30 use utf8;
    31 
    32 package W3C::Validator::MarkupValidator;
    33 
    34 #
    35 # Modules.  See also the BEGIN block further down below.
    36 #
    37 # Version numbers given where we absolutely need a minimum version of a given
    38 # module (gives nicer error messages). By default, add an empty import list
    39 # when loading modules to prevent non-OO or poorly written modules from
    40 # polluting our namespace.
    41 #
    42 
    43 # Need 3.40 for query string and path info fixes, #4365
    44 use CGI 3.40 qw(-newstyle_urls -private_tempfiles redirect);
    45 use CGI::Carp qw(carp croak fatalsToBrowser);
    46 use Config qw(%Config);
    47 use Config::General 2.32 qw();    # Need 2.32 for <msg 0>, rt.cpan.org#17852
    48 use Encode qw();
    49 use Encode::Alias qw();
    50 use Encode::HanExtra qw();        # for some chinese character encodings,
    51                                   # e.g gb18030
    52 use File::Spec::Functions qw(catfile rel2abs tmpdir);
    53 use HTML::Encoding 0.52 qw();
    54 use HTML::HeadParser 3.60 qw();    # Needed for HTML5 meta charset workaround
    55 use HTML::Parser 3.24 qw();        # Need 3.24 for $p->parse($code_ref)
    56 use HTML::Template qw();           # Need 2.6 for path param, other things.
    57                                    # Specifying 2.6 would break with 2.10,
    58                                    # rt.cpan.org#70190
    59 use HTTP::Headers::Util qw();
    60 use HTTP::Message 1.52 qw();       # Need 1.52 for decoded_content()
    61 use HTTP::Request qw();
    62 use HTTP::Headers::Auth qw();      # Needs to be imported after other HTTP::*.
    63 use JSON 2.00 qw();
    64 use SGML::Parser::OpenSP 0.991 qw();
    65 use URI 1.53 qw();                 # Need 1.53 for secure()
    66 use URI::Escape qw(uri_escape);
    67 use URI::file;
    68 use URI::Heuristic qw();
    69 
    70 ###############################################################################
    71 #### Constant definitions. ####################################################
    72 ###############################################################################
    73 
    74 #
    75 # Define global constants
    76 use constant TRUE  => 1;
    77 use constant FALSE => 0;
    78 
    79 #
    80 # Tentative Validation Severities.
    81 use constant T_WARN  => 4;    # 0000 0100
    82 use constant T_ERROR => 8;    # 0000 1000
    83 
    84 #
    85 # Define global variables.
    86 use vars qw($DEBUG $CFG %RSRC $VERSION);
    87 $VERSION = '1.3';
    88 
    89 use constant IS_MODPERL2 =>
    90     (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);
    91 
    92 #
    93 # Things inside BEGIN don't happen on every request in persistent environments
    94 # (such as mod_perl); so let's do the globals, eg. read config, here.
    95 BEGIN {
    96 
    97     my $base = $ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator';
    98 
    99     # Launder data for -T; -AutoLaunder doesn't catch this one.
   100     if ($base =~ /^(.*)$/) {
   101         $base = $1;
   102     }
   103 
   104     #
   105     # Read Config Files.
   106     eval {
   107         my %config_opts = (
   108             -ConfigFile =>
   109                 ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'),
   110             -MergeDuplicateOptions => TRUE,
   111             -MergeDuplicateBlocks  => TRUE,
   112             -SplitPolicy           => 'equalsign',
   113             -UseApacheInclude      => TRUE,
   114             -IncludeRelative       => TRUE,
   115             -InterPolateVars       => TRUE,
   116             -AutoLaunder           => TRUE,
   117             -AutoTrue              => TRUE,
   118             -CComments             => FALSE,
   119             -DefaultConfig         => {
   120                 Protocols => {Allow => 'http,https'},
   121                 Paths     => {
   122                     Base  => $base,
   123                     Cache => '',
   124                 },
   125                 External => {HTML5 => FALSE,},
   126             },
   127         );
   128         my %cfg = Config::General->new(%config_opts)->getall();
   129         $CFG = \%cfg;
   130     };
   131     if ($@) {
   132         die <<"EOF";
   133 Could not read configuration.  Set the W3C_VALIDATOR_CFG environment variable
   134 or copy conf/* to /etc/w3c/. Make sure that the configuration file and all
   135 included files are readable by the web server user. The error was:\n'$@'
   136 EOF
   137     }
   138 
   139     #
   140     # Check paths in config
   141     # @@FIXME: This does not do a very good job error-message-wise if
   142     # a path is missing...
   143     {
   144         my %paths = map { $_ => [-d $_, -r _] } $CFG->{Paths}->{Base},
   145             $CFG->{Paths}->{Templates}, $CFG->{Paths}->{SGML}->{Library};
   146         my @_d = grep { not $paths{$_}->[0] } keys %paths;
   147         my @_r = grep { not $paths{$_}->[1] } keys %paths;
   148         die "Does not exist or is not a directory: @_d\n"       if scalar(@_d);
   149         die "Directory not readable (permission denied): @_r\n" if scalar(@_r);
   150     }
   151 
   152     #
   153     # Split allowed protocols into a list.
   154     if (my $allowed = delete($CFG->{Protocols}->{Allow})) {
   155         $CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)];
   156     }
   157 
   158     # Split available languages into a list
   159     if (my $langs = delete($CFG->{Languages})) {
   160         $CFG->{Languages} = [split(/\s+/, $langs)];
   161     }
   162     else {
   163 
   164         # Default to english
   165         $CFG->{Languages} = ["en"];
   166     }
   167 
   168     {    # Make types config indexed by FPI.
   169         my $types = {};
   170         while (my ($key, $value) = each %{$CFG->{Types}}) {
   171             $types->{$CFG->{Types}->{$key}->{PubID}} = $value;
   172         }
   173         $CFG->{Types} = $types;
   174     }
   175 
   176     #
   177     # Change strings to internal constants in MIME type mapping.
   178     while (my ($key, $value) = each %{$CFG->{MIME}}) {
   179         $CFG->{MIME}->{$key} = 'TBD'
   180             unless ($value eq 'SGML' || $value eq 'XML');
   181     }
   182 
   183     #
   184     # Register Encode aliases.
   185     while (my ($key, $value) = each %{$CFG->{Charsets}}) {
   186         Encode::Alias::define_alias($key, $1) if ($value =~ /^[AX] (\S+)/);
   187     }
   188 
   189     #
   190     # Set debug flag.
   191     if ($CFG->{'Allow Debug'}) {
   192         $DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'};
   193     }
   194     else {
   195         $DEBUG = FALSE;
   196     }
   197 
   198     # Read friendly error message file
   199     # 'en_US' should be replaced by $lang for lang-neg
   200     %RSRC = Config::General->new(
   201         -MergeDuplicateBlocks => 1,
   202         -ConfigFile =>
   203             catfile($CFG->{Paths}->{Templates}, 'en_US', 'error_messages.cfg'),
   204     )->getall();
   205 
   206     eval {
   207         local $SIG{__DIE__} = undef;
   208         require Encode::JIS2K;    # for optional extra Japanese encodings
   209     };
   210 
   211     # Tell libxml to load _only_ our XML catalog.  This is because our entity
   212     # load jailing may trap the libxml internal default catalog (which is
   213     # automatically loaded).  Preventing loading that from the input callback
   214     # will cause libxml to not see the document content at all but to throw
   215     # weird "Document is empty" errors, at least as of XML::LibXML 1.70 and
   216     # libxml 2.7.7.  XML_CATALOG_FILES needs to be in effect at XML::LibXML
   217     # load time which is why we're using "require" here instead of pulling it
   218     # in with "use" as usual.  And finally, libxml should have support for
   219     # SGML open catalogs but they don't seem to work (again as of 1.70 and
   220     # 2.7.7); if we use xml.soc here, no entities seem to end up being resolved
   221     # from it - so we use a (redundant) XML catalog which works.
   222     # Note that setting XML_CATALOG_FILES here does not seem to work with
   223     # mod_perl (it doesn't end up being used by XML::LibXML), therefore we do
   224     # it in the mod_perl/startup.pl startup file for it too.
   225     local $ENV{XML_CATALOG_FILES} =
   226         catfile($CFG->{Paths}->{SGML}->{Library}, 'catalog.xml');
   227     require XML::LibXML;
   228     XML::LibXML->VERSION(1.73);    # Need 1.73 for rt.cpan.org #66642
   229 
   230 }    # end of BEGIN block.
   231 
   232 #
   233 # Get rid of (possibly insecure) $PATH.
   234 delete $ENV{PATH};
   235 
   236 #@@DEBUG: Dump $CFG datastructure. Used only as a developer aid.
   237 #use Data::Dumper qw(Dumper);
   238 #print Dumper($CFG);
   239 #exit;
   240 #@@DEBUG;
   241 
   242 ###############################################################################
   243 #### Process CGI variables and initialize. ####################################
   244 ###############################################################################
   245 
   246 #
   247 # Create a new CGI object.
   248 my $q = CGI->new();
   249 
   250 #
   251 # The data structure that will hold all session data.
   252 # @@FIXME This can't be my() as $File will sooner or
   253 # later be undef and add_warning will cause the script
   254 # to die. our() seems to work but has other problems.
   255 # @@FIXME Apparently, this must be set to {} also,
   256 # otherwise the script might pick up an old object
   257 # after abort_if_error_flagged under mod_perl.
   258 our $File = {};
   259 
   260 #################################
   261 # Initialize the datastructure. #
   262 #################################
   263 
   264 #
   265 # Charset data (casing policy: lowercase early).
   266 $File->{Charset}->{Use}      = ''; # The charset used for validation.
   267 $File->{Charset}->{Auto}     = ''; # Autodetection using XML rules (Appendix F)
   268 $File->{Charset}->{HTTP}     = ''; # From HTTP's "charset" parameter.
   269 $File->{Charset}->{META}     = ''; # From HTML's <meta http-equiv>.
   270 $File->{Charset}->{XML}      = ''; # From the XML Declaration.
   271 $File->{Charset}->{Override} = ''; # From CGI/user override.
   272 
   273 #
   274 # Misc simple types.
   275 $File->{Mode} =
   276     'DTD+SGML';    # Default parse mode is  DTD validation in SGML mode.
   277 
   278 # By default, perform validation (we may perform only xml-wf in some cases)
   279 $File->{XMLWF_ONLY} = FALSE;
   280 
   281 #
   282 # Listrefs.
   283 $File->{Warnings}   = [];    # Warnings...
   284 $File->{Namespaces} = [];    # Other (non-root) Namespaces.
   285 $File->{Parsers}    = [];    # Parsers used {name, link, type, options}
   286 
   287 # By default, doctype-less documents cannot be valid
   288 $File->{"DOCTYPEless OK"}             = FALSE;
   289 $File->{"Default DOCTYPE"}->{"HTML"}  = 'HTML 4.01 Transitional';
   290 $File->{"Default DOCTYPE"}->{"XHTML"} = 'XHTML 1.0 Transitional';
   291 
   292 ###############################################################################
   293 #### Generate Template for Result. ############################################
   294 ###############################################################################
   295 
   296 # first we determine the chosen language based on
   297 # 1) lang argument given as parameter (if this language is available)
   298 # 2) HTTP language negotiation between variants available and user-agent choices
   299 # 3) English by default
   300 my $lang = $q->param('lang') || '';
   301 my @localizations;
   302 foreach my $lang_available (@{$CFG->{Languages}}) {
   303     if ($lang eq $lang_available) {
   304 
   305         # Requested language (from parameters) is available, just use it
   306         undef @localizations;
   307         last;
   308     }
   309     push @localizations,
   310         [
   311         $lang_available, 1,               'text/html', undef,
   312         'utf-8',         $lang_available, undef
   313         ];
   314 }
   315 
   316 # If language is not chosen yet, use HTTP-based negotiation
   317 if (@localizations) {
   318     require HTTP::Negotiate;
   319     $lang = HTTP::Negotiate::choose(\@localizations);
   320 }
   321 
   322 # HTTP::Negotiate::choose may return undef e.g if sent Accept-Language: en;q=0
   323 $lang ||= 'en_US';
   324 
   325 if ($lang eq "en") {
   326     $lang = 'en_US';    # legacy
   327 }
   328 
   329 $File->{Template_Defaults} = {
   330     die_on_bad_params => FALSE,
   331     loop_context_vars => TRUE,
   332     global_vars       => TRUE,
   333     case_sensitive    => TRUE,
   334     path              => [catfile($CFG->{Paths}->{Templates}, $lang)],
   335     filter => sub { my $ref = shift; ${$ref} = Encode::decode_utf8(${$ref}); },
   336 };
   337 if (IS_MODPERL2()) {
   338     $File->{Template_Defaults}->{cache} = TRUE;
   339 }
   340 elsif ($CFG->{Paths}->{Cache}) {
   341     $File->{Template_Defaults}->{file_cache} = TRUE;
   342     $File->{Template_Defaults}->{file_cache_dir} =
   343         rel2abs($CFG->{Paths}->{Cache}, tmpdir());
   344 }
   345 
   346 undef $lang;
   347 
   348 #########################################
   349 # Populate $File->{Opt} -- CGI Options. #
   350 #########################################
   351 
   352 #
   353 # Preprocess the CGI parameters.
   354 $q = &prepCGI($File, $q);
   355 
   356 #
   357 # Set session switches.
   358 $File->{Opt}->{Outline}        = $q->param('outline') ? TRUE : FALSE;
   359 $File->{Opt}->{'Show Source'}  = $q->param('ss')      ? TRUE : FALSE;
   360 $File->{Opt}->{'Show Tidy'}    = $q->param('st')      ? TRUE : FALSE;
   361 $File->{Opt}->{Verbose}        = $q->param('verbose') ? TRUE : FALSE;
   362 $File->{Opt}->{'Group Errors'} = $q->param('group')   ? TRUE : FALSE;
   363 $File->{Opt}->{Debug}          = $q->param('debug')   ? TRUE : FALSE;
   364 $File->{Opt}->{No200}          = $q->param('No200')   ? TRUE : FALSE;
   365 $File->{Opt}->{Prefill}        = $q->param('prefill') ? TRUE : FALSE;
   366 $File->{Opt}->{'Prefill Doctype'} = $q->param('prefill_doctype') || 'html401';
   367 $File->{Opt}->{Charset} = lc($q->param('charset') || '');
   368 $File->{Opt}->{DOCTYPE} = $q->param('doctype') || '';
   369 
   370 $File->{Opt}->{'User Agent'} =
   371     $q->param('user-agent') &&
   372     $q->param('user-agent') ne "1" ? $q->param('user-agent') :
   373                                      "W3C_Validator/$VERSION " . $CFG->{'User Agent Info'};
   374 $File->{Opt}->{'User Agent'} =~ tr/\x00-\x09\x0b\x0c-\x1f//d;
   375 
   376 if ($File->{Opt}->{'User Agent'} eq 'mobileok') {
   377     $File->{Opt}->{'User Agent'} =
   378         'W3C-mobileOK/DDC-1.0 (see http://www.w3.org/2006/07/mobileok-ddc)';
   379 }
   380 
   381 $File->{Opt}->{'Accept Header'}          = $q->param('accept')          || '';
   382 $File->{Opt}->{'Accept-Language Header'} = $q->param('accept-language') || '';
   383 $File->{Opt}->{'Accept-Charset Header'}  = $q->param('accept-charset')  || '';
   384 $File->{Opt}->{$_} =~ tr/\x00-\x09\x0b\x0c-\x1f//d
   385     for ('Accept Header', 'Accept-Language Header', 'Accept-Charset Header');
   386 
   387 #
   388 # "Fallback" info for Character Encoding (fbc), Content-Type (fbt),
   389 # and DOCTYPE (fbd). If TRUE, the Override values are treated as
   390 # Fallbacks instead of Overrides.
   391 $File->{Opt}->{FB}->{Charset} = $q->param('fbc') ? TRUE : FALSE;
   392 $File->{Opt}->{FB}->{Type}    = $q->param('fbt') ? TRUE : FALSE;
   393 $File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE;
   394 
   395 #
   396 # If ";debug" was given, let it overrule the value from the config file,
   397 # regardless of whether it's "0" or "1" (on or off), but only if config
   398 # allows the debugging options.
   399 if ($CFG->{'Allow Debug'}) {
   400     $DEBUG = $q->param('debug') if defined $q->param('debug');
   401     $File->{Opt}->{Verbose} = TRUE if $DEBUG;
   402 }
   403 else {
   404     $DEBUG = FALSE;    # The default.
   405 }
   406 $File->{Opt}->{Debug} = $DEBUG;
   407 
   408 &abort_if_error_flagged($File);
   409 
   410 #
   411 # Get the file and metadata.
   412 if ($q->param('uploaded_file')) {
   413     $File = &handle_file($q, $File);
   414 }
   415 elsif ($q->param('fragment')) {
   416     $File = &handle_frag($q, $File);
   417 }
   418 elsif ($q->param('uri')) {
   419     $File = &handle_uri($q, $File);
   420 }
   421 
   422 #
   423 # Abort if an error was flagged during initialization.
   424 &abort_if_error_flagged($File);
   425 
   426 #
   427 # Get rid of the CGI object.
   428 undef $q;
   429 
   430 #
   431 # We don't need STDIN any more, so get rid of it to avoid getting clobbered
   432 # by Apache::Registry's idiotic interference under mod_perl.
   433 untie *STDIN;
   434 
   435 ###############################################################################
   436 #### Output validation results. ###############################################
   437 ###############################################################################
   438 
   439 if (!$File->{ContentType} && !$File->{'Direct Input'} && !$File->{'Is Upload'})
   440 {
   441     &add_warning('W08', {});
   442 }
   443 
   444 $File = find_encodings($File);
   445 
   446 #
   447 # Decide on a charset to use (first part)
   448 #
   449 if ($File->{Charset}->{HTTP}) {    # HTTP, if given, is authoritative.
   450     $File->{Charset}->{Use} = $File->{Charset}->{HTTP};
   451 }
   452 elsif ($File->{ContentType} =~ m(^text/(?:[-.a-zA-Z0-9]\+)?xml$)) {
   453 
   454     # Act as if $http_charset was 'us-ascii'. (MIME rules)
   455     $File->{Charset}->{Use} = 'us-ascii';
   456 
   457     &add_warning(
   458         'W01',
   459         {   W01_upload => $File->{'Is Upload'},
   460             W01_agent  => $File->{Server},
   461             W01_ct     => $File->{ContentType},
   462         }
   463     );
   464 
   465 }
   466 elsif ($File->{Charset}->{XML}) {
   467     $File->{Charset}->{Use} = $File->{Charset}->{XML};
   468 }
   469 elsif ($File->{BOM} &&
   470     $File->{BOM} == 2 &&
   471     $File->{Charset}->{Auto} =~ /^utf-16[bl]e$/)
   472 {
   473     $File->{Charset}->{Use} = 'utf-16';
   474 }
   475 elsif ($File->{ContentType} =~ m(^application/(?:[-.a-zA-Z0-9]+\+)?xml$)) {
   476     $File->{Charset}->{Use} = "utf-8";
   477 }
   478 elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) {
   479     $File->{Charset}->{Use} = 'utf-8';    # UTF-8 (image/svg+xml etc.)
   480 }
   481 $File->{Charset}->{Use} ||= $File->{Charset}->{META};
   482 
   483 #
   484 # Handle any Fallback or Override for the charset.
   485 if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) {
   486 
   487     # charset=foo was given to the CGI and it wasn't "autodetect" or empty.
   488     #
   489     # Extract the user-requested charset from CGI param.
   490     my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2);
   491     $File->{Charset}->{Override} = lc($override);
   492 
   493     if ($File->{Opt}->{FB}->{Charset}) {    # charset fallback mode
   494         unless ($File->{Charset}->{Use})
   495         {    # no charset detected, actual fallback
   496             &add_warning('W02', {W02_charset => $File->{Charset}->{Override}});
   497             $File->{Tentative} |= T_ERROR;    # Tag it as Invalid.
   498             $File->{Charset}->{Use} = $File->{Charset}->{Override};
   499         }
   500     }
   501     else {                                    # charset "hard override" mode
   502         if (!$File->{Charset}->{Use}) {       # overriding "nothing"
   503             &add_warning(
   504                 'W04',
   505                 {   W04_charset  => $File->{Charset}->{Override},
   506                     W04_override => TRUE
   507                 }
   508             );
   509             $File->{Tentative} |= T_ERROR;
   510             $File->{Charset}->{Use} = $File->{Charset}->{Override};
   511         }
   512         elsif ($File->{Charset}->{Override} ne $File->{Charset}->{Use}) {
   513 
   514             # Actually overriding something; warn about override.
   515             &add_warning(
   516                 'W03',
   517                 {   W03_use => $File->{Charset}->{Use},
   518                     W03_opt => $File->{Charset}->{Override}
   519                 }
   520             );
   521             $File->{Tentative} |= T_ERROR;
   522             $File->{Charset}->{Use} = $File->{Charset}->{Override};
   523         }
   524     }
   525 }
   526 
   527 if ($File->{'Direct Input'}) {    #explain why UTF-8 is forced
   528     &add_warning('W28', {});
   529 }
   530 unless ($File->{Charset}->{XML} || $File->{Charset}->{META})
   531 {                                 #suggest character encoding info within doc
   532     &add_warning('W27', {});
   533 }
   534 
   535 #
   536 # Abort if an error was flagged while finding the encoding.
   537 &abort_if_error_flagged($File);
   538 
   539 $File->{Charset}->{Default} = FALSE;
   540 unless ($File->{Charset}->{Use}) {    # No charset given...
   541     $File->{Charset}->{Use}     = 'utf-8';
   542     $File->{Charset}->{Default} = TRUE;
   543     $File->{Tentative} |= T_ERROR;    # Can never be valid.
   544     &add_warning('W04', {W04_charset => "UTF-8"});
   545 }
   546 
   547 # Always transcode, even if the content claims to be UTF-8
   548 $File = transcode($File);
   549 
   550 # Try guessing if it didn't work out
   551 if ($File->{ContentType} eq 'text/html' && $File->{Charset}->{Default}) {
   552     my $also_tried = 'UTF-8';
   553     for my $cs (qw(windows-1252 iso-8859-1)) {
   554         last unless $File->{'Error Flagged'};
   555         $File->{'Error Flagged'} = FALSE;    # reset
   556         $File->{Charset}->{Use} = $cs;
   557         &add_warning('W04',
   558             {W04_charset => $cs, W04_also_tried => $also_tried});
   559         $File = transcode($File);
   560         $also_tried .= ", $cs";
   561     }
   562 }
   563 
   564 # if it still does not work, we abandon hope here
   565 &abort_if_error_flagged($File);
   566 
   567 #
   568 # Add a warning if doc is UTF-8 and contains a BOM.
   569 if ($File->{Charset}->{Use} eq 'utf-8' &&
   570     @{$File->{Content}} &&
   571     $File->{Content}->[0] =~ m(^\x{FEFF}))
   572 {
   573     &add_warning('W21', {});
   574 }
   575 
   576 #
   577 # Overall parsing algorithm for documents returned as text/html:
   578 #
   579 # For documents that come to us as text/html,
   580 #
   581 #  1. check if there's a doctype
   582 #  2. if there is a doctype, parse/validate against that DTD
   583 #  3. if no doctype, check for an xmlns= attribute on the first element, or
   584 #     XML declaration
   585 #  4. if no doctype and XML mode, check for XML well-formedness
   586 #  5. otherwise, punt.
   587 #
   588 
   589 #
   590 # Override DOCTYPE if user asked for it.
   591 if ($File->{Opt}->{DOCTYPE}) {
   592     if ($File->{Opt}->{DOCTYPE} !~ /(?:Inline|detect)/i) {
   593         $File = &override_doctype($File);
   594     }
   595     else {
   596 
   597         # Get rid of inline|detect for easy truth value checking later
   598         $File->{Opt}->{DOCTYPE} = '';
   599     }
   600 }
   601 
   602 # Try to extract a DOCTYPE or xmlns.
   603 $File = &preparse_doctype($File);
   604 
   605 if ($File->{Opt}->{DOCTYPE} eq "HTML5") {
   606     $File->{DOCTYPE} = "HTML5";
   607     $File->{Version} = $File->{DOCTYPE};
   608 }
   609 
   610 set_parse_mode($File, $CFG);
   611 
   612 #
   613 # Sanity check Charset information and add any warnings necessary.
   614 $File = &charset_conflicts($File);
   615 
   616 # before we start the parsing, clean slate
   617 $File->{'Is Valid'} = TRUE;
   618 $File->{Errors}     = [];
   619 $File->{WF_Errors}  = [];
   620 
   621 if (($File->{DOCTYPE} eq "HTML5") or ($File->{DOCTYPE} eq "XHTML5")) {
   622     if ($CFG->{External}->{HTML5}) {
   623         $File = &html5_validate($File);
   624         &add_warning(
   625             'W00',
   626             {   W00_experimental_name => "HTML5 Conformance Checker",
   627                 W00_experimental_URI  => "feedback.html"
   628             }
   629         );
   630     }
   631     else {
   632         $File->{'Error Flagged'} = TRUE;
   633         my $tmpl = &get_error_template($File);
   634         $tmpl->param(fatal_no_checker      => TRUE);
   635         $tmpl->param(fatal_missing_checker => 'HTML5 Validator');
   636     }
   637 }
   638 elsif (($File->{DOCTYPE} eq '') and
   639     (($File->{Root} eq "svg") or @{$File->{Namespaces}} > 1))
   640 {
   641 
   642     # we send doctypeless SVG, or any doctypeless XML document with multiple
   643     # namespaces found, to a different engine. WARNING this is experimental.
   644     if ($CFG->{External}->{CompoundXML}) {
   645         $File = &compoundxml_validate($File);
   646         &add_warning(
   647             'W00',
   648             {   W00_experimental_name => "validator.nu Conformance Checker",
   649                 W00_experimental_URI  => "feedback.html"
   650             }
   651         );
   652     }
   653 }
   654 else {
   655     $File = &dtd_validate($File);
   656 }
   657 &abort_if_error_flagged($File);
   658 if (&is_xml($File)) {
   659     if ($File->{DOCTYPE} eq "HTML5") {
   660 
   661         # $File->{DOCTYPE} = "XHTML5";
   662         # $File->{Version} = "XHTML5";
   663     }
   664     else {
   665 
   666         # XMLWF check can be slow, skip if we already know the doc can't pass.
   667         # http://www.w3.org/Bugs/Public/show_bug.cgi?id=9899
   668         $File = &xmlwf($File) if $File->{'Is Valid'};
   669     }
   670     &abort_if_error_flagged($File);
   671 }
   672 
   673 #
   674 # Force "XML" if type is an XML type and an FPI was not found.
   675 # Otherwise set the type to be the FPI.
   676 if (&is_xml($File) and not $File->{DOCTYPE} and lc($File->{Root}) ne 'html') {
   677     $File->{Version} = 'XML';
   678 }
   679 else {
   680     $File->{Version} ||= $File->{DOCTYPE};
   681 }
   682 
   683 #
   684 # Get the pretty text version of the FPI if a mapping exists.
   685 if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) {
   686     $File->{Version} = $prettyver;
   687 }
   688 
   689 #
   690 # check the received mime type against Allowed mime types
   691 if ($File->{ContentType}) {
   692     my @allowedMediaType =
   693         split(/\s+/,
   694         $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed} || '');
   695     my $usedCTisAllowed;
   696     if (scalar @allowedMediaType) {
   697         $usedCTisAllowed = FALSE;
   698         foreach (@allowedMediaType) {
   699             $usedCTisAllowed = TRUE if ($_ eq $File->{ContentType});
   700         }
   701     }
   702     else {
   703 
   704         # wedon't know what media type is recommended, so better shut up
   705         $usedCTisAllowed = TRUE;
   706     }
   707     if (!$usedCTisAllowed) {
   708         &add_warning(
   709             'W23',
   710             {   W23_type => $File->{ContentType},
   711                 W23_type_pref =>
   712                     $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred},
   713                 w23_doctype => $File->{Version}
   714             }
   715         );
   716     }
   717 }
   718 
   719 #
   720 # Warn about unknown, incorrect, or missing Namespaces.
   721 if ($File->{Namespace}) {
   722     my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE;
   723 
   724     if (&is_xml($File)) {
   725         if ($ns eq $File->{Namespace}) {
   726             &add_warning(
   727                 'W10',
   728                 {   W10_ns   => $File->{Namespace},
   729                     W10_type => $File->{Type},
   730                 }
   731             );
   732         }
   733     }
   734     elsif ($File->{DOCTYPE} ne 'HTML5') {
   735         &add_warning(
   736             'W11',
   737             {   W11_ns      => $File->{Namespace},
   738                 w11_doctype => $File->{DOCTYPE}
   739             }
   740         );
   741     }
   742 }
   743 else {
   744     if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) {
   745         &add_warning('W12', {});
   746     }
   747 }
   748 
   749 ## if invalid content, AND if requested, pass through tidy
   750 if (!$File->{'Is Valid'} && $File->{Opt}->{'Show Tidy'}) {
   751     eval {
   752         local $SIG{__DIE__} = undef;
   753         require HTML::Tidy;
   754         my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}});
   755         my $cleaned = $tidy->clean(join("\n", @{$File->{Content}}));
   756         $cleaned = Encode::decode_utf8($cleaned);
   757         $File->{Tidy} = $cleaned;
   758     };
   759     if ($@) {
   760         (my $errmsg = $@) =~ s/ at .*//s;
   761         &add_warning('W29', {W29_msg => $errmsg});
   762     }
   763 }
   764 
   765 my %templates = (
   766     earl => ['earl_xml.tmpl', default_escape => 'HTML'],
   767     n3   => ['earl_n3.tmpl'],
   768     json => ['json_output.tmpl'],
   769     ucn  => ['ucn_output.tmpl'],
   770 );
   771 my $template = $templates{$File->{Opt}->{Output}};
   772 if ($template) {
   773     my $tname = shift(@$template);
   774     my $tmpl = &get_template($File, $tname, @$template);
   775     $template = $tmpl;
   776 }
   777 elsif ($File->{Opt}->{Output} eq 'soap12') {
   778     if ($CFG->{'Enable SOAP'} != 1) {
   779 
   780         # API disabled - ideally this should have been sent before performing
   781         # validation...
   782         print CGI::header(
   783             -status           => 503,
   784             -content_language => "en",
   785             -type             => "text/html",
   786             -charset          => "utf-8"
   787         );
   788         $template = &get_template($File, 'soap_disabled.tmpl');
   789     }
   790     else {
   791         $template = &get_template($File, 'soap_output.tmpl');
   792     }
   793 }
   794 else {
   795     $template = &get_template($File, 'result.tmpl');
   796 }
   797 
   798 &prep_template($File, $template);
   799 &fin_template($File, $template);
   800 
   801 $template->param(tidy_output   => $File->{Tidy});
   802 $template->param(file_source   => &source($File))
   803     if ($template->param('opt_show_source') or
   804     ($File->{'Is Upload'}) or
   805     ($File->{'Direct Input'}));
   806 
   807 if ($File->{Opt}->{Output} eq 'json') {
   808 
   809     # No JSON escaping in HTML::Template (and "JS" is not the right thing here)
   810     my $json = JSON->new();
   811     $json->allow_nonref(TRUE);
   812     if (my $msgs = $template->param("file_errors")) {
   813         for my $msg (@$msgs) {
   814             for my $key (qw(msg expl)) {
   815                 $msg->{$key} = $json->encode($msg->{$key}) if $msg->{$key};
   816             }
   817 
   818             # Drop non-numeric char indicators from output, e.g.
   819             # "> 80" for some XML parse error ones (see the non-structured
   820             # XML::LibXML code branch in XML preparsing below).
   821             if ($msg->{char} && $msg->{char} !~ /^\d+$/) {
   822                 delete($msg->{char});
   823             }
   824         }
   825     }
   826 }
   827 
   828 # transcode output from perl's internal to utf-8 and output
   829 print Encode::encode('UTF-8', $template->output);
   830 
   831 #
   832 # Get rid of $File object and exit.
   833 undef $File;
   834 exit;
   835 
   836 #############################################################################
   837 # Subroutine definitions
   838 #############################################################################
   839 
   840 sub get_template ($$;@)
   841 {
   842     my ($File, $fname, @opts) = @_;
   843     if (!$File->{_Templates}->{$fname}) {
   844         my $tmpl = HTML::Template->new(
   845             %{$File->{Template_Defaults}},
   846             filename => $fname,
   847             @opts
   848         );
   849         $tmpl->param(env_home_page     => $File->{Env}->{'Home Page'});
   850         $tmpl->param(validator_version => $VERSION);
   851         $File->{_Templates}->{$fname} = $tmpl;
   852     }
   853     return $File->{_Templates}->{$fname};
   854 }
   855 
   856 sub get_error_template ($;@)
   857 {
   858     my ($File, @opts) = @_;
   859     my $fname = 'fatal-error.tmpl';
   860     if ($File->{Opt}->{Output} eq 'soap12') {
   861         $fname = 'soap_fault.tmpl';
   862     }
   863     elsif ($File->{Opt}->{Output} eq 'ucn') {
   864         $fname = 'ucn_fault.tmpl';
   865     }
   866     return &get_template($File, $fname, @opts);
   867 }
   868 
   869 # TODO: need to bring in fixes from html5_validate() here
   870 sub compoundxml_validate (\$)
   871 {
   872     my $File = shift;
   873     my $ua = W3C::Validator::UserAgent->new($CFG, $File);
   874 
   875     push(
   876         @{$File->{Parsers}},
   877         {   name    => "Compound XML",
   878             link    => "http://qa-dev.w3.org/",    # TODO?
   879             type    => "",
   880             options => ""
   881         }
   882     );
   883 
   884     my $url = URI->new($CFG->{External}->{CompoundXML});
   885     $url->query("out=xml");
   886 
   887     my $req = HTTP::Request->new(POST => $url);
   888 
   889     if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) {
   890 
   891         # Doctype or charset overridden, need to use $File->{Content} in UTF-8
   892         # because $File->{Bytes} is not affected by the overrides.  This will
   893         # most likely be a source of errors about internal/actual charset
   894         # differences as long as our transcoding process does not "fix" the
   895         # charset info in XML declaration and meta http-equiv (any others?).
   896         if ($File->{'Direct Input'})
   897         {    # sane default when using html5 validator by direct input
   898             $req->content_type("application/xml; charset=UTF-8");
   899         }
   900         else {
   901             $req->content_type("$File->{ContentType}; charset=UTF-8");
   902         }
   903         $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}})));
   904     }
   905     else {
   906 
   907         # Pass original bytes, Content-Type and charset as-is.
   908         # We trust that our and validator.nu's interpretation of line numbers
   909         # is the same (regardless of EOL chars used in the document).
   910 
   911         my @content_type = ($File->{ContentType} => undef);
   912         push(@content_type, charset => $File->{Charset}->{HTTP})
   913             if $File->{Charset}->{HTTP};
   914 
   915         $req->content_type(
   916             HTTP::Headers::Util::join_header_words(@content_type));
   917         $req->content_ref(\$File->{Bytes});
   918     }
   919 
   920     $req->content_language($File->{ContentLang}) if $File->{ContentLang};
   921 
   922     # Intentionally using direct header access instead of $req->last_modified
   923     $req->header('Last-Modified', $File->{Modified}) if $File->{Modified};
   924 
   925     # If not in debug mode, gzip the request (LWP >= 5.817)
   926     eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug};
   927 
   928     my $res = $ua->request($req);
   929     if (!$res->is_success()) {
   930         $File->{'Error Flagged'} = TRUE;
   931         my $tmpl = &get_error_template($File);
   932         $tmpl->param(fatal_no_checker      => TRUE);
   933         $tmpl->param(fatal_missing_checker => 'HTML5 Validator');
   934         $tmpl->param(fatal_checker_error   => $res->status_line());
   935     }
   936     else {
   937         my $content = &get_content($File, $res);
   938         return $File if $File->{'Error Flagged'};
   939 
   940         # and now we parse according to
   941         # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output
   942         # I wish we could use XML::LibXML::Reader here. but SHAME on those
   943         # major unix distributions still shipping with libxml2 2.6.16… 4 years
   944         # after its release
   945         # …and we could use now as we require libxml2 >= 2.6.21 anyway…
   946         my $xml_reader = XML::LibXML->new();
   947         $xml_reader->base_uri($res->base());
   948 
   949         my $xmlDOM;
   950         eval { $xmlDOM = $xml_reader->parse_string($content); };
   951         if ($@) {
   952             my $errmsg = $@;
   953             $File->{'Error Flagged'} = TRUE;
   954             my $tmpl = &get_error_template($File);
   955             $tmpl->param(fatal_no_checker      => TRUE);
   956             $tmpl->param(fatal_missing_checker => 'HTML5 Validator');
   957             $tmpl->param(fatal_checker_error   => $errmsg);
   958             return $File;
   959         }
   960         my @nodelist      = $xmlDOM->getElementsByTagName("messages");
   961         my $messages_node = $nodelist[0];
   962         my @message_nodes = $messages_node->childNodes;
   963         foreach my $message_node (@message_nodes) {
   964             my $message_type = $message_node->localname;
   965             my ($err, $xml_error_msg, $xml_error_expl);
   966 
   967             if ($message_type eq "error") {
   968                 $err->{type} = "E";
   969                 $File->{'Is Valid'} = FALSE;
   970             }
   971             elsif ($message_type eq "info") {
   972 
   973                 # by default - we find warnings in the type attribute (below)
   974                 $err->{type} = "I";
   975             }
   976             if ($message_node->hasAttributes()) {
   977                 my @attributelist = $message_node->attributes();
   978                 foreach my $attribute (@attributelist) {
   979                     if ($attribute->name eq "type") {
   980                         if (($attribute->getValue() eq "warning") and
   981                             ($message_type eq "info"))
   982                         {
   983                             $err->{type} = "W";
   984                         }
   985 
   986                     }
   987                     if ($attribute->name eq "last-column") {
   988                         $err->{char} = $attribute->getValue();
   989                     }
   990                     if ($attribute->name eq "last-line") {
   991                         $err->{line} = $attribute->getValue();
   992                     }
   993 
   994                 }
   995             }
   996             my @child_nodes = $message_node->childNodes;
   997             foreach my $child_node (@child_nodes) {
   998                 if ($child_node->localname eq "message") {
   999                     $xml_error_msg = $child_node->toString();
  1000                     $xml_error_msg =~ s,</?[^>]*>,,gsi;
  1001                 }
  1002                 if ($child_node->localname eq "elaboration") {
  1003                     $xml_error_expl = $child_node->toString();
  1004                     $xml_error_expl =~ s,</?elaboration>,,gi;
  1005                     $xml_error_expl =
  1006                         "\n<div class=\"ve xml\">$xml_error_expl</div>\n";
  1007                 }
  1008             }
  1009 
  1010             # formatting the error message for output
  1011             $err->{src}  = "" if $err->{uri};    # TODO...
  1012             $err->{num}  = 'validator.nu';
  1013             $err->{msg}  = $xml_error_msg;
  1014             $err->{expl} = $xml_error_expl;
  1015 
  1016             if ($err->{msg} =~
  1017                 /Using the preset for (.*) based on the root namespace/)
  1018             {
  1019                 $File->{DOCTYPE} = $1;
  1020             }
  1021             else {
  1022                 push @{$File->{Errors}}, $err;
  1023             }
  1024 
  1025             # @@ TODO message explanation / elaboration
  1026         }
  1027     }
  1028     return $File;
  1029 }
  1030 
  1031 sub html5_validate (\$)
  1032 {
  1033     my $File = shift;
  1034     my $ua = W3C::Validator::UserAgent->new($CFG, $File);
  1035 
  1036     push(
  1037         @{$File->{Parsers}},
  1038         {   name    => "validator.nu",
  1039             link    => "http://validator.nu/",
  1040             type    => "HTML5",
  1041             options => ""
  1042         }
  1043     );
  1044 
  1045     my $url = URI->new($CFG->{External}->{HTML5});
  1046     $url->query("out=xml");
  1047 
  1048     my $req = HTTP::Request->new(POST => $url);
  1049     my $ct = &is_xml($File) ? "application/xhtml+xml" : "text/html";
  1050 
  1051     if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override} ||
  1052         $File->{'Direct Input'})
  1053     {
  1054 
  1055         # Doctype or charset overridden, need to use $File->{Content} in UTF-8
  1056         # because $File->{Bytes} is not affected by the  overrides.  Note that
  1057         # direct input is always considered an override here.
  1058 
  1059         &override_charset($File, "UTF-8");
  1060 
  1061         $ct = $File->{ContentType} unless $File->{'Direct Input'};
  1062         my @ct = ($ct => undef, charset => "UTF-8");
  1063         $ct = HTTP::Headers::Util::join_header_words(@ct);
  1064 
  1065         $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}})));
  1066     }
  1067     else {
  1068 
  1069         # Pass original bytes, Content-Type and charset as-is.
  1070         # We trust that our and validator.nu's interpretation of line numbers
  1071         # is the same later when displaying error contexts (regardless of EOL
  1072         # chars used in the document).
  1073 
  1074         my @ct = ($File->{ContentType} => undef);
  1075         push(@ct, charset => $File->{Charset}->{HTTP})
  1076             if $File->{Charset}->{HTTP};
  1077         $ct = HTTP::Headers::Util::join_header_words(@ct);
  1078 
  1079         $req->content_ref(\$File->{Bytes});
  1080     }
  1081     $req->content_type($ct);
  1082 
  1083     $req->content_language($File->{ContentLang}) if $File->{ContentLang};
  1084 
  1085     # Intentionally using direct header access instead of $req->last_modified
  1086     # (the latter takes seconds since epoch, but $File->{Modified} is an already
  1087     # formatted string).
  1088     $req->header('Last-Modified', $File->{Modified}) if $File->{Modified};
  1089 
  1090     # Use gzip in non-debug, remote HTML5 validator mode (LWP >= 5.817).
  1091     if (!$File->{Opt}->{Debug} &&
  1092         $url->host() !~ /^(?:localhost|127(?:\.\d+){3}|.*\.localdomain)$/i)
  1093     {
  1094         eval { $req->encode("gzip"); };
  1095     }
  1096     else {
  1097         $req->header('Accept-Encoding', 'identity');
  1098     }
  1099 
  1100     my $res = $ua->request($req);
  1101     if (!$res->is_success()) {
  1102         $File->{'Error Flagged'} = TRUE;
  1103         my $tmpl = &get_error_template($File);
  1104         $tmpl->param(fatal_no_checker      => TRUE);
  1105         $tmpl->param(fatal_missing_checker => 'HTML5 Validator');
  1106         $tmpl->param(fatal_checker_error   => $res->status_line());
  1107     }
  1108     else {
  1109         my $content = &get_content($File, $res);
  1110         return $File if $File->{'Error Flagged'};
  1111 
  1112         # and now we parse according to
  1113         # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output
  1114         # I wish we could use XML::LibXML::Reader here. but SHAME on those
  1115         # major unix distributions still shipping with libxml2 2.6.16… 4 years
  1116         # after its release
  1117         my $xml_reader = XML::LibXML->new();
  1118         $xml_reader->base_uri($res->base());
  1119 
  1120         my $xmlDOM;
  1121         eval { $xmlDOM = $xml_reader->parse_string($content); };
  1122         if ($@) {
  1123             my $errmsg = $@;
  1124             $File->{'Error Flagged'} = TRUE;
  1125             my $tmpl = &get_error_template($File);
  1126             $tmpl->param(fatal_no_checker      => TRUE);
  1127             $tmpl->param(fatal_missing_checker => 'HTML5 Validator');
  1128             $tmpl->param(fatal_checker_error   => $errmsg);
  1129             return $File;
  1130         }
  1131         my @nodelist      = $xmlDOM->getElementsByTagName("messages");
  1132         my $messages_node = $nodelist[0];
  1133         my @message_nodes = $messages_node->childNodes;
  1134         foreach my $message_node (@message_nodes) {
  1135             my $message_type = $message_node->localname;
  1136             my ($html5_error_msg, $html5_error_expl);
  1137             my $err = {};
  1138 
  1139             # TODO: non-document errors should receive different/better
  1140             # treatment, but this is better than hiding all problems for now
  1141             # (#6747)
  1142             if ($message_type eq "error" ||
  1143                 $message_type eq "non-document-error")
  1144             {
  1145                 $err->{type} = "E";
  1146                 $File->{'Is Valid'} = FALSE;
  1147             }
  1148             elsif ($message_type eq "info") {
  1149 
  1150                 # by default - we find warnings in the type attribute (below)
  1151                 $err->{type} = "I";
  1152             }
  1153             if ($message_node->hasAttributes()) {
  1154                 my @attributelist = $message_node->attributes();
  1155                 foreach my $attribute (@attributelist) {
  1156                     if ($attribute->name eq "type") {
  1157                         if (($attribute->getValue() eq "warning") and
  1158                             ($message_type eq "info"))
  1159                         {
  1160                             $err->{type} = "W";
  1161                         }
  1162 
  1163                     }
  1164                     elsif ($attribute->name eq "last-column") {
  1165                         $err->{char} = $attribute->getValue();
  1166                     }
  1167                     elsif ($attribute->name eq "last-line") {
  1168                         $err->{line} = $attribute->getValue();
  1169                     }
  1170                     elsif ($attribute->name eq "url") {
  1171                         &set_error_uri($err, $attribute->getValue());
  1172                     }
  1173                 }
  1174             }
  1175             my @child_nodes = $message_node->childNodes;
  1176             foreach my $child_node (@child_nodes) {
  1177                 if ($child_node->localname eq "message") {
  1178                     $html5_error_msg = $child_node->textContent();
  1179                 }
  1180                 elsif ($child_node->localname eq "elaboration") {
  1181                     $html5_error_expl = $child_node->toString();
  1182                     $html5_error_expl =~ s,</?elaboration>,,gi;
  1183                     $html5_error_expl =
  1184                         "\n<div class=\"ve html5\">$html5_error_expl</div>\n";
  1185                 }
  1186             }
  1187 
  1188             # formatting the error message for output
  1189 
  1190             # TODO: set $err->{src} from extract if we got an URI for the error:
  1191             # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output#The_extract_Element
  1192             # For now, set it directly to empty to prevent report_errors() from
  1193             # trying to populate it from our doc.
  1194             $err->{src} = "" if $err->{uri};
  1195 
  1196             $err->{num}  = 'html5';
  1197             $err->{msg}  = $html5_error_msg;
  1198             $err->{expl} = $html5_error_expl;
  1199             push @{$File->{Errors}}, $err;
  1200 
  1201             # @@ TODO message explanation / elaboration
  1202         }
  1203     }
  1204     return $File;
  1205 }
  1206 
  1207 sub dtd_validate (\$)
  1208 {
  1209     my $File   = shift;
  1210     my $opensp = SGML::Parser::OpenSP->new();
  1211 
  1212     #
  1213     # By default, use SGML catalog file and SGML Declaration.
  1214     my $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc');
  1215 
  1216     # default parsing options
  1217     my @spopt = qw(valid non-sgml-char-ref no-duplicate);
  1218 
  1219     #
  1220     # Switch to XML semantics if file is XML.
  1221     if (&is_xml($File)) {
  1222         $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc');
  1223         push(@spopt, 'xml');
  1224     }
  1225     else {
  1226 
  1227         # add warnings for shorttags
  1228         push(@spopt, 'min-tag');
  1229     }
  1230 
  1231     push(
  1232         @{$File->{Parsers}},
  1233         {   name    => "OpenSP",
  1234             link    => "http://openjade.sourceforge.net/",
  1235             type    => "SGML/XML",
  1236             options => join(" ", @spopt)
  1237         }
  1238     );
  1239 
  1240     #
  1241     # Parser configuration
  1242     $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library});
  1243     $opensp->catalogs($catalog);
  1244     $opensp->show_error_numbers(1);
  1245     $opensp->warnings(@spopt);
  1246 
  1247     #
  1248     # Restricted file reading is disabled on Win32 for the time
  1249     # being since neither SGML::Parser::OpenSP nor check auto-
  1250     # magically set search_dirs to include the temp directory
  1251     # so restricted file reading would defunct the Validator.
  1252     $opensp->restrict_file_reading(1) unless $^O eq 'MSWin32';
  1253 
  1254     my $h;    # event handler
  1255     if ($File->{Opt}->{Outline}) {
  1256         $h = W3C::Validator::EventHandler::Outliner->new($opensp, $File, $CFG);
  1257     }
  1258     else {
  1259         $h = W3C::Validator::EventHandler->new($opensp, $File, $CFG);
  1260     }
  1261 
  1262     $opensp->handler($h);
  1263     $opensp->parse_string(join "\n", @{$File->{Content}});
  1264 
  1265     # Make sure there are no circular references, otherwise the script
  1266     # would leak memory until mod_perl unloads it which could take some
  1267     # time. @@FIXME It's probably overly careful though.
  1268     $opensp->handler(undef);
  1269     undef $h->{_parser};
  1270     undef $h->{_file};
  1271     undef $h;
  1272     undef $opensp;
  1273 
  1274     #
  1275     # Set Version to be the FPI initially.
  1276     $File->{Version} = $File->{DOCTYPE};
  1277     return $File;
  1278 }
  1279 
  1280 sub xmlwf (\$)
  1281 {
  1282 
  1283     # we should really be using a SAX ErrorHandler, but I can't find a way to
  1284     # make it work with XML::LibXML::SAX::Parser... ** FIXME **
  1285     # ditto, we should try using W3C::Validator::EventHandler, but it's badly
  1286     # linked to opensp at the moment
  1287 
  1288     my $File      = shift;
  1289     my $xmlparser = XML::LibXML->new();
  1290     $xmlparser->line_numbers(1);
  1291     $xmlparser->validation(0);
  1292     $xmlparser->base_uri($File->{URI})
  1293         unless ($File->{'Direct Input'} || $File->{'Is Upload'});
  1294 
  1295     push(
  1296         @{$File->{Parsers}},
  1297         {   name    => "libxml2",
  1298             link    => "http://xmlsoft.org/",
  1299             type    => "XML",
  1300             options => ""
  1301         }
  1302     );
  1303 
  1304     # Restrict file reading similar to what SGML::Parser::OpenSP does.  Note
  1305     # that all inputs go through the callback so if we were passing a
  1306     # URI/filename to the parser, it would be affected as well and would break
  1307     # fetching the initial document.  As long as we pass the doc as string,
  1308     # this should work.
  1309     my $cb = XML::LibXML::InputCallback->new();
  1310     $cb->register_callbacks([\&xml_jail_match, sub { }, sub { }, sub { }]);
  1311     $xmlparser->input_callbacks($cb);
  1312 
  1313     &override_charset($File, "UTF-8");
  1314 
  1315     eval { $xmlparser->parse_string(join("\n", @{$File->{Content}})); };
  1316 
  1317     if (ref($@)) {
  1318 
  1319         # handle a structured error (XML::LibXML::Error object)
  1320 
  1321         my $err_obj = $@;
  1322         while ($err_obj) {
  1323             my $err = {};
  1324             &set_error_uri($err, $err_obj->file());
  1325             $err->{src}  = &ent($err_obj->context()) if $err->{uri};
  1326             $err->{line} = $err_obj->line();
  1327             $err->{char} = $err_obj->column();
  1328             $err->{num}  = "libxml2-" . $err_obj->code();
  1329             $err->{type} = "E";
  1330             $err->{msg}  = $err_obj->message();
  1331 
  1332             $err_obj = $err_obj->_prev();
  1333 
  1334             unshift(@{$File->{WF_Errors}}, $err);
  1335         }
  1336     }
  1337     elsif ($@) {
  1338         my $xmlwf_errors      = $@;
  1339         my $xmlwf_error_line  = undef;
  1340         my $xmlwf_error_col   = undef;
  1341         my $xmlwf_error_msg   = undef;
  1342         my $got_error_message = undef;
  1343         my $got_quoted_line   = undef;
  1344         foreach my $msg_line (split "\n", $xmlwf_errors) {
  1345 
  1346             $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g;
  1347             $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{};
  1348 
  1349             # first we get the actual error message
  1350             if (!$got_error_message &&
  1351                 $msg_line =~ /^(:\d+:)( parser error : .*)/)
  1352             {
  1353                 $xmlwf_error_line = $1;
  1354                 $xmlwf_error_msg  = $2;
  1355                 $xmlwf_error_line =~ s/:(\d+):/$1/;
  1356                 $xmlwf_error_msg  =~ s/ parser error :/XML Parsing Error: /;
  1357                 $got_error_message = 1;
  1358             }
  1359 
  1360             # then we skip the second line, which shows the context
  1361             # (we don't use that)
  1362             elsif ($got_error_message && !$got_quoted_line) {
  1363                 $got_quoted_line = 1;
  1364             }
  1365 
  1366             # we now take the third line, with the pointer to the error's
  1367             # column
  1368             elsif (($msg_line =~ /(\s+)\^/) and
  1369                 $got_error_message and
  1370                 $got_quoted_line)
  1371             {
  1372                 $xmlwf_error_col = length($1);
  1373             }
  1374 
  1375             #  cleanup for a number of bugs for the column number
  1376             if (defined($xmlwf_error_col)) {
  1377                 if ((   my $l =
  1378                         length($File->{Content}->[$xmlwf_error_line - 1])
  1379                     ) < $xmlwf_error_col
  1380                     )
  1381                 {
  1382 
  1383                     # http://bugzilla.gnome.org/show_bug.cgi?id=434196
  1384                     #warn("Warning: reported error column larger than line length " .
  1385                     #     "($xmlwf_error_col > $l) in $File->{URI} line " .
  1386                     #     "$xmlwf_error_line, libxml2 bug? Resetting to line length.");
  1387                     $xmlwf_error_col = $l;
  1388                 }
  1389                 elsif ($xmlwf_error_col == 79) {
  1390 
  1391                     # working around an apparent odd limitation of libxml which
  1392                     # only gives context for lines up to 80 chars
  1393                     # http://www.w3.org/Bugs/Public/show_bug.cgi?id=4420
  1394                     # http://bugzilla.gnome.org/show_bug.cgi?id=424017
  1395                     $xmlwf_error_col = "> 80";
  1396 
  1397                     # non-int line number will trigger the proper behavior in
  1398                     # report_error
  1399                 }
  1400             }
  1401 
  1402             # when we have all the info (one full error message), proceed
  1403             # and move on to the next error
  1404             if ((defined $xmlwf_error_line) and
  1405                 (defined $xmlwf_error_col) and
  1406                 (defined $xmlwf_error_msg))
  1407             {
  1408 
  1409                 # Reinitializing for the next batch of 3 lines
  1410                 $got_error_message = undef;
  1411                 $got_quoted_line   = undef;
  1412 
  1413                 # formatting the error message for output
  1414                 my $err = {};
  1415 
  1416                 # TODO: set_error_uri() (need test case)
  1417                 $err->{src}  = "" if $err->{uri};    # TODO...
  1418                 $err->{line} = $xmlwf_error_line;
  1419                 $err->{char} = $xmlwf_error_col;
  1420                 $err->{num}  = 'xmlwf';
  1421                 $err->{type} = "E";
  1422                 $err->{msg}  = $xmlwf_error_msg;
  1423 
  1424                 push(@{$File->{WF_Errors}}, $err);
  1425                 $xmlwf_error_line = undef;
  1426                 $xmlwf_error_col  = undef;
  1427                 $xmlwf_error_msg  = undef;
  1428             }
  1429         }
  1430     }
  1431 
  1432     $File->{'Is Valid'} = FALSE if @{$File->{WF_Errors}};
  1433     return $File;
  1434 }
  1435 
  1436 #
  1437 # Generate HTML report.
  1438 sub prep_template ($$)
  1439 {
  1440     my $File = shift;
  1441     my $T    = shift;
  1442 
  1443     #
  1444     # XML mode...
  1445     $T->param(is_xml => &is_xml($File));
  1446 
  1447     #
  1448     # Upload?
  1449     $T->param(is_upload => $File->{'Is Upload'});
  1450 
  1451     #
  1452     # Direct Input?
  1453     $T->param(is_direct_input => $File->{'Direct Input'});
  1454 
  1455     #
  1456     # The URI...
  1457     $T->param(file_uri => $File->{URI});
  1458 
  1459     #
  1460     # HTTPS note?
  1461     $T->param(file_https_note => $File->{'Is Upload'} ||
  1462             $File->{'Direct Input'} ||
  1463             URI->new($File->{URI})->secure());
  1464 
  1465     #
  1466     # Set URL for page title.
  1467     $T->param(page_title_url => $File->{URI});
  1468 
  1469     #
  1470     # Metadata...
  1471     $T->param(file_modified    => $File->{Modified});
  1472     $T->param(file_server      => $File->{Server});
  1473     $T->param(file_size        => $File->{Size});
  1474     $T->param(file_contenttype => $File->{ContentType});
  1475     $T->param(file_charset     => $File->{Charset}->{Use});
  1476     $T->param(file_doctype     => $File->{DOCTYPE});
  1477 
  1478     #
  1479     # Output options...
  1480     $T->param(opt_show_source  => $File->{Opt}->{'Show Source'});
  1481     $T->param(opt_show_tidy    => $File->{Opt}->{'Show Tidy'});
  1482     $T->param(opt_show_outline => $File->{Opt}->{Outline});
  1483     $T->param(opt_verbose      => $File->{Opt}->{Verbose});
  1484     $T->param(opt_group_errors => $File->{Opt}->{'Group Errors'});
  1485     $T->param(opt_no200        => $File->{Opt}->{No200});
  1486 
  1487     # Root Element
  1488     $T->param(root_element => $File->{Root});
  1489 
  1490     # Namespaces...
  1491     $T->param(file_namespace => $File->{Namespace});
  1492 
  1493     # Non-root ones; unique, preserving occurrence order
  1494     my %seen_ns = ();
  1495     $seen_ns{$File->{Namespace}}++ if defined($File->{Namespace});
  1496     my @nss =
  1497         map { $seen_ns{$_}++ == 0 ? {uri => $_} : () } @{$File->{Namespaces}};
  1498     $T->param(file_namespaces => \@nss) if @nss;
  1499 
  1500     if ($File->{Opt}->{DOCTYPE}) {
  1501         my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}";
  1502         $T->param($over_doctype_param => TRUE);
  1503     }
  1504 
  1505     if ($File->{Opt}->{Charset}) {
  1506         my $over_charset_param = "override charset $File->{Opt}->{Charset}";
  1507         $T->param($over_charset_param => TRUE);
  1508     }
  1509 
  1510     # Allow content-negotiation
  1511     if ($File->{Opt}->{'Accept Header'}) {
  1512         $T->param('accept' => $File->{Opt}->{'Accept Header'});
  1513     }
  1514     if ($File->{Opt}->{'Accept-Language Header'}) {
  1515         $T->param(
  1516             'accept-language' => $File->{Opt}->{'Accept-Language Header'});
  1517     }
  1518     if ($File->{Opt}->{'Accept-Charset Header'}) {
  1519         $T->param('accept-charset' => $File->{Opt}->{'Accept-Charset Header'});
  1520     }
  1521     if ($File->{Opt}->{'User Agent'}) {
  1522         $T->param('user-agent' => $File->{Opt}->{'User Agent'});
  1523     }
  1524     if ($File->{'Error Flagged'}) {
  1525         $T->param(fatal_error => TRUE);
  1526     }
  1527 }
  1528 
  1529 sub fin_template ($$)
  1530 {
  1531     my $File = shift;
  1532     my $T    = shift;
  1533 
  1534     #
  1535     # Set debug info for HTML and SOAP reports.
  1536     if ($DEBUG) {
  1537         my @parsers;
  1538         for my $parser (@{$File->{Parsers}}) {
  1539             my $p = $parser->{name};
  1540             $p .= " (" . $parser->{options} . ")" if $parser->{options};
  1541             push(@parsers, $p);
  1542         }
  1543         $T->param(
  1544             debug => [
  1545                 map({name => $_, value => $ENV{$_}},
  1546                     qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)),
  1547                 {name => 'Content-Encoding',  value => $File->{ContentEnc}},
  1548                 {name => 'Content-Language',  value => $File->{ContentLang}},
  1549                 {name => 'Content-Location',  value => $File->{ContentLoc}},
  1550                 {name => 'Transfer-Encoding', value => $File->{TransferEnc}},
  1551                 {name => 'Parse Mode',        value => $File->{Mode}},
  1552                 {name => 'Parse Mode Factor', value => $File->{ModeChoice}},
  1553                 {name => 'Parsers Used',      value => join(", ", @parsers)},
  1554             ],
  1555         );
  1556     }
  1557 
  1558     $T->param(parsers => $File->{Parsers});
  1559 
  1560     if (!$File->{Doctype} &&
  1561         (!$File->{Version} ||
  1562             $File->{Version} eq 'unknown' ||
  1563             $File->{Version} eq 'SGML')
  1564         )
  1565     {
  1566         my $default_doctype =
  1567             $File->{"Default DOCTYPE"}->{&is_xml($File) ? "XHTML" : "HTML"};
  1568         $T->param(file_version => "$default_doctype");
  1569     }
  1570     else {
  1571         $T->param(file_version => $File->{Version});
  1572     }
  1573     my ($num_errors, $num_warnings, $num_info, $reported_errors) =
  1574         &report_errors($File);
  1575     if ($num_errors + $num_warnings > 0) {
  1576         $T->param(has_errors => 1);
  1577     }
  1578     $T->param(valid_errors_num => $num_errors);
  1579     $num_warnings += scalar @{$File->{Warnings}};
  1580     $T->param(valid_warnings_num => $num_warnings);
  1581     my $number_of_errors   = "";    # textual form of $num_errors
  1582     my $number_of_warnings = "";    # textual form of $num_errors
  1583 
  1584     # The following is a bit hack-ish, but will enable us to have some logic
  1585     # for a human-readable display of the number, with cases for 0, 1, 2 and
  1586     # above (the case of 2 appears to be useful for localization in some
  1587     # languages where the plural is different for 2, and above)
  1588 
  1589     if ($num_errors > 1) {
  1590         $T->param(number_of_errors_is_0 => FALSE);
  1591         $T->param(number_of_errors_is_1 => FALSE);
  1592         if ($num_errors == 2) {
  1593             $T->param(number_of_errors_is_2 => TRUE);
  1594         }
  1595         else {
  1596             $T->param(number_of_errors_is_2 => FALSE);
  1597         }
  1598         $T->param(number_of_errors_is_plural => TRUE);
  1599     }
  1600     elsif ($num_errors == 1) {
  1601         $T->param(number_of_errors_is_0      => FALSE);
  1602         $T->param(number_of_errors_is_1      => TRUE);
  1603         $T->param(number_of_errors_is_2      => FALSE);
  1604         $T->param(number_of_errors_is_plural => FALSE);
  1605     }
  1606     else {    # 0
  1607         $T->param(number_of_errors_is_0      => TRUE);
  1608         $T->param(number_of_errors_is_1      => FALSE);
  1609         $T->param(number_of_errors_is_2      => FALSE);
  1610         $T->param(number_of_errors_is_plural => FALSE);
  1611     }
  1612 
  1613     if ($num_warnings > 1) {
  1614         $T->param(number_of_warnings_is_0 => FALSE);
  1615         $T->param(number_of_warnings_is_1 => FALSE);
  1616         if ($num_warnings == 2) {
  1617             $T->param(number_of_warnings_is_2 => TRUE);
  1618         }
  1619         else {
  1620             $T->param(number_of_warnings_is_2 => FALSE);
  1621         }
  1622         $T->param(number_of_warnings_is_plural => TRUE);
  1623     }
  1624     elsif ($num_warnings == 1) {
  1625         $T->param(number_of_warnings_is_0      => FALSE);
  1626         $T->param(number_of_warnings_is_1      => TRUE);
  1627         $T->param(number_of_warnings_is_2      => FALSE);
  1628         $T->param(number_of_warnings_is_plural => FALSE);
  1629     }
  1630     else {    # 0
  1631         $T->param(number_of_warnings_is_0      => TRUE);
  1632         $T->param(number_of_warnings_is_1      => FALSE);
  1633         $T->param(number_of_warnings_is_2      => FALSE);
  1634         $T->param(number_of_warnings_is_plural => FALSE);
  1635     }
  1636 
  1637     $T->param(file_outline => $File->{heading_outline})
  1638         if $File->{Opt}->{Outline};
  1639 
  1640     $T->param(file_errors => $reported_errors);
  1641     if ($File->{'Is Valid'}) {
  1642         $T->param(VALID        => TRUE);
  1643         $T->param(valid_status => 'Valid');
  1644         &report_valid($File, $T);
  1645     }
  1646     else {
  1647         $T->param(VALID        => FALSE);
  1648         $T->param(valid_status => 'Invalid');
  1649     }
  1650 }
  1651 
  1652 #
  1653 # Output "This page is Valid" report.
  1654 sub report_valid
  1655 {
  1656     my $File = shift;
  1657     my $T    = shift;
  1658 
  1659     unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) {
  1660 
  1661         if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) {
  1662             my $cfg = $CFG->{Types}->{$File->{DOCTYPE}};
  1663             $T->param(badge_uri           => $cfg->{Badge}->{URI});
  1664             $T->param(local_badge_uri     => $cfg->{Badge}->{'Local URI'});
  1665             $T->param(badge_alt_uri       => $cfg->{Badge}->{'Alt URI'});
  1666             $T->param(local_alt_badge_uri => $cfg->{Badge}->{'Local ALT URI'});
  1667             $T->param(badge_alt           => $cfg->{Badge}->{Alt});
  1668             $T->param(badge_rdfa          => $cfg->{Badge}->{RDFa});
  1669             $T->param(badge_h             => $cfg->{Badge}->{Height});
  1670             $T->param(badge_w             => $cfg->{Badge}->{Width});
  1671             $T->param(badge_onclick       => $cfg->{Badge}->{OnClick});
  1672             $T->param(badge_tagc => $cfg->{'Parse Mode'} eq 'XML' ? ' /' : '');
  1673         }
  1674     }
  1675     elsif (defined $File->{Tentative}) {
  1676         $T->param(is_tentative => TRUE);
  1677     }
  1678 
  1679     if ($File->{XMLWF_ONLY}) {
  1680         $T->param(xmlwf_only => TRUE);
  1681     }
  1682     my $thispage = self_url_file($File);
  1683     $T->param(file_thispage => $thispage);
  1684 }
  1685 
  1686 #
  1687 # Add a warning message to the output.
  1688 sub add_warning ($$)
  1689 {
  1690     my $WID    = shift;
  1691     my $params = shift;
  1692 
  1693     push @{$File->{Warnings}}, $WID;
  1694 
  1695     my %tmplparams = (
  1696         $WID          => TRUE,
  1697         have_warnings => TRUE,
  1698         %$params,
  1699     );
  1700     for my $tmpl (qw(result fatal-error soap_output ucn_output)) {
  1701         &get_template($File, "$tmpl.tmpl")->param(%tmplparams);
  1702     }
  1703 }
  1704 
  1705 #
  1706 # Proxy authentication requests.
  1707 # Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth).
  1708 sub authenticate
  1709 {
  1710     my $File       = shift;
  1711     my $resource   = shift;
  1712     my $authHeader = shift || {};
  1713 
  1714     my $realm = $resource;
  1715     $realm =~ s([^\w\d.-]*){}g;
  1716 
  1717     while (my ($scheme, $header) = each %$authHeader) {
  1718         my $origrealm = $header->{realm};
  1719         if (not defined $origrealm or $scheme !~ /^(?:basic|digest)$/i) {
  1720             delete($authHeader->{$scheme});
  1721             next;
  1722         }
  1723         $header->{realm} = "$realm-$origrealm";
  1724     }
  1725 
  1726     my $headers = HTTP::Headers->new(Connection => 'close');
  1727     $headers->www_authenticate(%$authHeader);
  1728     $headers = $headers->as_string();
  1729     chomp($headers);
  1730 
  1731     my $tmpl = &get_template($File, 'http_401_authrequired.tmpl');
  1732     $tmpl->param(http_401_headers => $headers);
  1733     $tmpl->param(http_401_url     => $resource);
  1734 
  1735     print Encode::encode('UTF-8', $tmpl->output);
  1736     exit;    # Further interaction will be a new HTTP request.
  1737 }
  1738 
  1739 #
  1740 # Fetch an URL and return the content and selected meta-info.
  1741 sub handle_uri
  1742 {
  1743     my $q    = shift;    # The CGI object.
  1744     my $File = shift;    # The master datastructure.
  1745 
  1746     my $ua = W3C::Validator::UserAgent->new($CFG, $File);
  1747 
  1748     my $uri = URI->new(ref $q ? $q->param('uri') : $q)->canonical();
  1749     $uri->fragment(undef);
  1750 
  1751     if (!$uri->scheme()) {
  1752         local $ENV{URL_GUESS_PATTERN} = '';
  1753         my $guess = URI::Heuristic::uf_uri($uri);
  1754         if ($guess->scheme() && $ua->is_protocol_supported($guess)) {
  1755             $uri = $guess;
  1756         }
  1757         else {
  1758             $uri = URI->new("http://$uri");
  1759         }
  1760     }
  1761 
  1762     unless ($ua->is_protocol_supported($uri)) {
  1763         $File->{'Error Flagged'} = TRUE;
  1764         my $tmpl = &get_error_template($File);
  1765 
  1766         # If uri param is empty (also for empty direct or upload), it's been
  1767         # set to TRUE in sub prepCGI()
  1768         if ($uri->canonical() eq "1") {
  1769             $tmpl->param(fatal_no_content => TRUE);
  1770         }
  1771         else {
  1772             $tmpl->param(fatal_uri_error  => TRUE);
  1773             $tmpl->param(fatal_uri_scheme => $uri->scheme());
  1774         }
  1775         return $File;
  1776     }
  1777 
  1778     return $File unless $ua->uri_ok($uri);
  1779 
  1780     my $req = HTTP::Request->new(GET => $uri);
  1781 
  1782     # if one wants to use the accept, accept-charset and accept-language params
  1783     # in order to trigger specific negotiation
  1784     if ($File->{Opt}->{'Accept Header'}) {
  1785         $req->header(Accept => $File->{Opt}->{'Accept Header'});
  1786     }
  1787     if ($File->{Opt}->{'Accept-Language Header'}) {
  1788         $req->header(
  1789             Accept_Language => $File->{Opt}->{'Accept-Language Header'});
  1790     }
  1791     if ($File->{Opt}->{'Accept-Charset Header'}) {
  1792         $req->header(
  1793             Accept_Charset => $File->{Opt}->{'Accept-Charset Header'});
  1794     }
  1795 
  1796     # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
  1797     # If we're under mod_perl, there is a way around it...
  1798     my $http_auth = $ENV{HTTP_AUTHORIZATION};
  1799     eval {
  1800         local $SIG{__DIE__} = undef;
  1801         my $auth =
  1802             Apache2::RequestUtil->request()->headers_in()->{Authorization};
  1803         $http_auth = $auth if $auth;
  1804     } if (IS_MODPERL2() && !$http_auth);
  1805 
  1806     # If we got a Authorization header, the client is back at it after being
  1807     # prompted for a password so we insert the header as is in the request.
  1808     $req->headers->header(Authorization => $http_auth) if $http_auth;
  1809 
  1810     my $res = $ua->request($req);
  1811 
  1812     return $File if $File->{'Error Flagged'};    # Redirect IP rejected?
  1813 
  1814     unless ($res->code == 200 or $File->{Opt}->{'No200'}) {
  1815         if ($res->code == 401) {
  1816             my %auth = $res->www_authenticate();    # HTTP::Headers::Auth
  1817             &authenticate($File, $res->request->uri, \%auth);
  1818         }
  1819         else {
  1820             $File->{'Error Flagged'} = TRUE;
  1821 
  1822             my $no200url = undef;
  1823             if (!$File->{Opt}->{No200}) {
  1824 
  1825                 # $File->{URI} not set yet; setting it non-local has side
  1826                 # effects
  1827                 local $File->{URI} = $uri->as_string;
  1828                 local $File->{Opt}->{No200} = TRUE;
  1829                 $no200url = &self_url_file($File);
  1830             }
  1831 
  1832             my $warning = $res->header("Client-Warning");
  1833             if ($warning && $warning =~ /Internal response/i) {
  1834 
  1835                 # Response doc generated internally by LWP, no need to show
  1836                 # that info nor to provide error doc validation link to it.
  1837                 $warning  = undef;
  1838                 $no200url = undef;
  1839             }
  1840 
  1841             my $tmpl = &get_error_template($File);
  1842             $tmpl->param(fatal_http_error => TRUE);
  1843             $tmpl->param(fatal_http_uri   => $uri->as_string);
  1844             $tmpl->param(fatal_http_code  => $res->code);
  1845             $tmpl->param(fatal_http_msg   => $res->message);
  1846             $tmpl->param(fatal_http_warn  => $warning);
  1847             $tmpl->param(fatal_http_no200 => $no200url);
  1848             $tmpl->param(fatal_http_dns   => TRUE) if ($res->code == 500);
  1849         }
  1850 
  1851         return $File;
  1852     }
  1853 
  1854     #
  1855     # Enforce Max Recursion level.
  1856     &check_recursion($File, $res);
  1857 
  1858     my ($mode, $ct, $charset) = &parse_content_type(
  1859         $File,
  1860         scalar($res->header('Content-Type')),
  1861         scalar($res->request->uri),
  1862     );
  1863 
  1864     my $content = &get_content($File, $res);
  1865     return $File if $File->{'Error Flagged'};
  1866 
  1867     $File->{Bytes}           = $content;
  1868     $File->{Mode}            = $mode;
  1869     $File->{ContentType}     = $ct;
  1870     $File->{ContentEnc}      = $res->content_encoding;
  1871     $File->{ContentLang}     = $res->content_language;
  1872     $File->{ContentLoc}      = $res->header('Content-Location');
  1873     $File->{TransferEnc}     = $res->header('Client-Transfer-Encoding');
  1874     $File->{Charset}->{HTTP} = lc $charset if defined $charset;
  1875     $File->{Modified}        = $res->header('Last-Modified');
  1876     $File->{Server}          = scalar $res->server;
  1877 
  1878     # TODO: Content-Length is not always set, so either this should
  1879     # be renamed to 'Content-Length' or it should consider more than
  1880     # the Content-Length header.
  1881     $File->{Size}           = scalar $res->content_length;
  1882     $File->{URI}            = scalar $res->request->uri->canonical;
  1883     $File->{'Is Upload'}    = FALSE;
  1884     $File->{'Direct Input'} = FALSE;
  1885 
  1886     return $File;
  1887 }
  1888 
  1889 #
  1890 # Handle uploaded file and return the content and selected meta-info.
  1891 sub handle_file
  1892 {
  1893     my $q    = shift;    # The CGI object.
  1894     my $File = shift;    # The master datastructure.
  1895 
  1896     my $p = $q->param('uploaded_file');
  1897     my $f = $q->upload('uploaded_file');
  1898     if (!defined($f)) {
  1899 
  1900         # Probably not an uploaded file as far as CGI is concerned,
  1901         # treat as a fragment.
  1902         $q->param('fragment', $p);
  1903         return &handle_frag($q, $File);
  1904     }
  1905 
  1906     my $h = $q->uploadInfo($p);
  1907 
  1908     local $/ = undef;    # set line delimiter so that <> reads rest of file
  1909     my $file = <$f>;
  1910 
  1911     my ($mode, $ct, $charset) =
  1912         &parse_content_type($File, $h->{'Content-Type'});
  1913 
  1914     $File->{Bytes}           = $file;
  1915     $File->{Mode}            = $mode;
  1916     $File->{ContentType}     = $ct;
  1917     $File->{Charset}->{HTTP} = lc $charset if defined $charset;
  1918     $File->{Modified}        = $q->http('Last-Modified');
  1919     $File->{Server}          = $q->http('User-Agent');   # Fake a "server". :-)
  1920     $File->{Size}           = $q->http('Content-Length');
  1921     $File->{URI}            = "$p";
  1922     $File->{'Is Upload'}    = TRUE;
  1923     $File->{'Direct Input'} = FALSE;
  1924 
  1925     return $File;
  1926 }
  1927 
  1928 #
  1929 # Handle uploaded file and return the content and selected meta-info.
  1930 sub handle_frag
  1931 {
  1932     my $q    = shift;    # The CGI object.
  1933     my $File = shift;    # The master datastructure.
  1934 
  1935     $File->{Bytes}          = $q->param('fragment');
  1936     $File->{Mode}           = 'TBD';
  1937     $File->{Modified}       = '';
  1938     $File->{Server}         = '';
  1939     $File->{Size}           = '';
  1940     $File->{ContentType}    = '';                           # @@TODO?
  1941     $File->{URI}            = 'upload://Form Submission';
  1942     $File->{'Is Upload'}    = FALSE;
  1943     $File->{'Direct Input'} = TRUE;
  1944     $File->{Charset}->{HTTP} =
  1945         "utf-8";    # by default, the form accepts utf-8 chars
  1946 
  1947     if ($File->{Opt}->{Prefill}) {
  1948 
  1949         # we surround the HTML fragment with some basic document structure
  1950         my $prefill_Template;
  1951         if ($File->{Opt}->{'Prefill Doctype'} eq 'html401') {
  1952             $prefill_Template = &get_template($File, 'prefill_html401.tmpl');
  1953         }
  1954         else {
  1955             $prefill_Template = &get_template($File, 'prefill_xhtml10.tmpl');
  1956         }
  1957         $prefill_Template->param(fragment => $File->{Bytes});
  1958         $File->{Bytes} = $prefill_Template->output();
  1959 
  1960         # Let's force the view source so that the user knows what we've put
  1961         # around their code.
  1962         $File->{Opt}->{'Show Source'} = TRUE;
  1963 
  1964         # Ignore doctype overrides (#5132).
  1965         $File->{Opt}->{DOCTYPE} = 'Inline';
  1966     }
  1967 
  1968     return $File;
  1969 }
  1970 
  1971 #
  1972 # Parse a Content-Type and parameters. Return document type and charset.
  1973 sub parse_content_type
  1974 {
  1975     my $File         = shift;
  1976     my $Content_Type = shift;
  1977     my $url          = shift;
  1978     my $charset      = '';
  1979 
  1980     my ($ct) = lc($Content_Type) =~ /^\s*([^\s;]*)/g;
  1981 
  1982     my $mode = $CFG->{MIME}->{$ct} || $ct;
  1983 
  1984     $charset = HTML::Encoding::encoding_from_content_type($Content_Type);
  1985 
  1986     if (index($mode, '/') != -1) {   # a "/" means it's unknown or we'd have a mode here.
  1987         if ($ct eq 'text/css' and defined $url) {
  1988             print redirect
  1989                 'http://jigsaw.w3.org/css-validator/validator?uri=' .
  1990                 uri_escape $url;
  1991             exit;
  1992         }
  1993         elsif ($ct eq 'application/atom+xml' and defined $url) {
  1994             print redirect 'http://validator.w3.org/feed/check.cgi?url=' .
  1995                 uri_escape $url;
  1996             exit;
  1997         }
  1998         elsif ($ct =~ m(^application/.+\+xml$)) {
  1999 
  2000             # unknown media types which should be XML - we give these a try
  2001             $mode = "XML";
  2002         }
  2003         else {
  2004             $File->{'Error Flagged'} = TRUE;
  2005             my $tmpl = &get_error_template($File);
  2006             $tmpl->param(fatal_mime_error => TRUE);
  2007             $tmpl->param(fatal_mime_ct    => $ct);
  2008         }
  2009     }
  2010 
  2011     return $mode, $ct, $charset;
  2012 }
  2013 
  2014 #
  2015 # Get content with Content-Encodings decoded from a response.
  2016 sub get_content ($$)
  2017 {
  2018     my $File = shift;
  2019     my $res  = shift;
  2020 
  2021     my $content;
  2022     eval {
  2023         $content = $res->decoded_content(charset => 'none', raise_error => 1);
  2024     };
  2025     if ($@) {
  2026         (my $errmsg = $@) =~ s/ at .*//s;
  2027         my $cenc = $res->header("Content-Encoding");
  2028         my $uri  = $res->request->uri;
  2029         $File->{'Error Flagged'} = TRUE;
  2030         my $tmpl = &get_error_template($File);
  2031         $tmpl->param(fatal_decode_error  => TRUE);
  2032         $tmpl->param(fatal_decode_errmsg => $errmsg);
  2033         $tmpl->param(fatal_decode_cenc   => $cenc);
  2034 
  2035         # Include URI because it might be a subsystem (eg. HTML5 validator) one
  2036         $tmpl->param(fatal_decode_uri => $uri);
  2037     }
  2038 
  2039     return $content;
  2040 }
  2041 
  2042 #
  2043 # Check recursion level and enforce Max Recursion limit.
  2044 sub check_recursion ($$)
  2045 {
  2046     my $File = shift;
  2047     my $res  = shift;
  2048 
  2049     # Not looking at our own output.
  2050     return unless defined $res->header('X-W3C-Validator-Recursion');
  2051 
  2052     my $lvl = $res->header('X-W3C-Validator-Recursion');
  2053     return unless $lvl =~ m/^\d+$/;    # Non-digit, i.e. garbage, ignore.
  2054 
  2055     if ($lvl >= $CFG->{'Max Recursion'}) {
  2056         print redirect $File->{Env}->{'Home Page'};
  2057     }
  2058     else {
  2059 
  2060         # Increase recursion level in output.
  2061         &get_template($File, 'result.tmpl')->param(depth => $lvl++);
  2062     }
  2063 }
  2064 
  2065 #
  2066 # XML::LibXML::InputCallback matcher using our SGML search path jail.
  2067 sub xml_jail_match
  2068 {
  2069     my $arg = shift;
  2070 
  2071     # Ensure we have a file:// URI if we get a file.
  2072     my $uri = URI->new($arg);
  2073     if (!$uri->scheme()) {
  2074         $uri = URI::file->new_abs($arg);
  2075     }
  2076     $uri = $uri->canonical();
  2077 
  2078     # Do not trap non-file URIs.
  2079     return 0 unless ($uri->scheme() eq "file");
  2080 
  2081     # Do not trap file URIs within our jail.
  2082     for my $dir ($CFG->{Paths}->{SGML}->{Library},
  2083         split(/\Q$Config{path_sep}\E/o, $ENV{SGML_SEARCH_PATH} || ''))
  2084     {
  2085         next unless $dir;
  2086         my $dir_uri = URI::file->new_abs($dir)->canonical()->as_string();
  2087         $dir_uri =~ s|/*$|/|;    # ensure it ends with a slash
  2088         return 0 if ($uri =~ /^\Q$dir_uri\E/);
  2089     }
  2090 
  2091     # We have a match (a file outside the jail).
  2092     return 1;
  2093 }
  2094 
  2095 #
  2096 # Escape text to be included in markup comment.
  2097 sub escape_comment
  2098 {
  2099     local $_ = shift;
  2100     return '' unless defined;
  2101     s/--/- /g;
  2102     return $_;
  2103 }
  2104 
  2105 #
  2106 # Return $_[0] encoded for HTML entities (cribbed from merlyn).
  2107 #
  2108 # Note that this is used both for HTML and XML escaping (so e.g. no &apos;).
  2109 #
  2110 sub ent
  2111 {
  2112     my $str = shift;
  2113     return '' unless defined($str);    # Eliminate warnings
  2114 
  2115     # should switch to hex sooner or later
  2116     $str =~ s/&/&#38;/g;
  2117     $str =~ s/</&#60;/g;
  2118     $str =~ s/>/&#62;/g;
  2119     $str =~ s/"/&#34;/g;
  2120     $str =~ s/'/&#39;/g;
  2121 
  2122     return $str;
  2123 }
  2124 
  2125 #
  2126 # Truncate source lines for report.
  2127 # Expects 1-based column indexes.
  2128 sub truncate_line
  2129 {
  2130     my $line   = shift;
  2131     my $col    = shift;
  2132     my $maxlen = 80;      # max line length to truncate to
  2133 
  2134     my $diff = length($line) - $maxlen;
  2135 
  2136     # Don't truncate at all if it fits.
  2137     return ($line, $col) if ($diff <= 0);
  2138 
  2139     my $start = $col - int($maxlen / 2);
  2140     if ($start < 0) {
  2141 
  2142         # Truncate only from end of line.
  2143         $start = 0;
  2144         $line = substr($line, $start, $maxlen - 1) . '…';
  2145     }
  2146     elsif ($start > $diff) {
  2147 
  2148         # Truncate only from beginning of line.
  2149         $start = $diff;
  2150         $line = '…' . substr($line, $start + 1);
  2151     }
  2152     else {
  2153 
  2154         # Truncate from both beginning and end of line.
  2155         $line = '…' . substr($line, $start + 1, $maxlen - 2) . '…';
  2156     }
  2157 
  2158     # Shift column if we truncated from beginning of line.
  2159     $col -= $start;
  2160 
  2161     return ($line, $col);
  2162 }
  2163 
  2164 #
  2165 # Suppress any existing DOCTYPE by commenting it out.
  2166 sub override_doctype
  2167 {
  2168     my $File = shift;
  2169 
  2170     my ($dt) =
  2171         grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} }
  2172         values %{$CFG->{Types}};
  2173 
  2174     # @@TODO: abort/whine about unrecognized doctype if $dt is undef.;
  2175     my $pubid = $dt->{PubID};
  2176     my $sysid = $dt->{SysID};
  2177     my $name  = $dt->{Name};
  2178 
  2179     # The HTML5 PubID is a fake, reset it out of the way.
  2180     $pubid = undef if ($pubid eq 'HTML5');
  2181 
  2182     # We don't have public/system ids for all types.
  2183     my $dtd = "<!DOCTYPE $name";
  2184     if ($pubid) {
  2185         $dtd .= qq( PUBLIC "$pubid");
  2186         $dtd .= qq( "$sysid") if $sysid;
  2187     }
  2188     elsif ($sysid) {
  2189         $dtd .= qq( SYSTEM "$sysid");
  2190     }
  2191     $dtd .= '>';
  2192 
  2193     my $org_dtd      = '';
  2194     my $HTML         = '';
  2195     my $seen_doctype = FALSE;
  2196 
  2197     my $declaration = sub {
  2198         my ($tag, $text) = @_;
  2199         if ($seen_doctype || uc($tag) ne '!DOCTYPE') {
  2200             $HTML .= $text;
  2201             return;
  2202         }
  2203 
  2204         $seen_doctype = TRUE;
  2205 
  2206         $org_dtd = &ent($text);
  2207         ($File->{Root}, undef, $File->{DOCTYPE}) = $text =~
  2208             /<!DOCTYPE\s+(\w[\w\.-]+)(?:\s+(?:PUBLIC|SYSTEM)\s+(['"])(.*?)\2)?\s*>/si;
  2209 
  2210         $File->{DOCTYPE} = 'HTML5'
  2211             if (
  2212             lc($File->{Root} || '') eq 'html' &&
  2213             (!defined($File->{DOCTYPE}) ||
  2214                 $File->{DOCTYPE} eq 'about:legacy-compat')
  2215             );
  2216 
  2217         # No Override if Fallback was requested, or if override is the same as
  2218         # detected
  2219         my $known = $CFG->{Types}->{$File->{DOCTYPE}};
  2220         if ($File->{Opt}->{FB}->{DOCTYPE} or
  2221             ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display}))
  2222         {
  2223             $HTML .= $text;    # Stash it as is...
  2224         }
  2225         else {
  2226             $HTML .= "$dtd<!-- " . &escape_comment($text) . " -->";
  2227         }
  2228     };
  2229 
  2230     my $start_element = sub {
  2231         my $p = shift;
  2232         # Sneak chosen doctype before the root elt if none replaced thus far.
  2233         $HTML .= $dtd unless $seen_doctype;
  2234         $HTML .= shift;
  2235         # We're done with this handler.
  2236         $p->handler(start => undef);
  2237     };
  2238 
  2239     HTML::Parser->new(
  2240         default_h => [sub { $HTML .= shift }, 'text'],
  2241         declaration_h => [$declaration,   'tag,text'],
  2242         start_h       => [$start_element, 'self,text']
  2243     )->parse(join "\n", @{$File->{Content}})->eof();
  2244 
  2245     $File->{Content} = [split /\n/, $HTML];
  2246 
  2247     if ($seen_doctype) {
  2248         my $known = $CFG->{Types}->{$File->{DOCTYPE}};
  2249         unless ($File->{Opt}->{FB}->{DOCTYPE} or
  2250             ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display}))
  2251         {
  2252             &add_warning(
  2253                 'W13',
  2254                 {   W13_org => $org_dtd,
  2255                     W13_new => $File->{Opt}->{DOCTYPE},
  2256                 }
  2257             );
  2258             $File->{Tentative} |= T_ERROR;    # Tag it as Invalid.
  2259         }
  2260     }
  2261     else {
  2262         if ($File->{"DOCTYPEless OK"}) {
  2263             &add_warning('W25', {W25_dtd => $File->{Opt}->{DOCTYPE}});
  2264         }
  2265         elsif ($File->{Opt}->{FB}->{DOCTYPE}) {
  2266             &add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}});
  2267             $File->{Tentative} |= T_ERROR;    # Tag it as Invalid.
  2268         }
  2269         else {
  2270             &add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}});
  2271             $File->{Tentative} |= T_ERROR;    # Tag it as Invalid.
  2272         }
  2273     }
  2274 
  2275     return $File;
  2276 }
  2277 
  2278 #
  2279 # Override inline charset declarations, for use e.g. when passing
  2280 # transcoded results to external parsers that use them.
  2281 sub override_charset ($$)
  2282 {
  2283     my ($File, $charset) = @_;
  2284 
  2285     my $ws = qr/[\x20\x09\x0D\x0A]/o;
  2286     my $cs = qr/[A-Za-z][a-zA-Z0-9_-]+/o;
  2287 
  2288     my $content = join("\n", @{$File->{Content}});
  2289 
  2290     # Flatten newlines (so that we don't end up changing line numbers while
  2291     # overriding) and comment-escape a string.
  2292     sub escape_original ($)
  2293     {
  2294         my $str = shift;
  2295         $str =~ tr/\r\n/ /;
  2296         return &escape_comment($str);
  2297     }
  2298 
  2299     # <?xml encoding="charset"?>
  2300     $content =~ s/(
  2301               (^<\?xml\b[^>]*?${ws}encoding${ws}*=${ws}*(["']))
  2302               (${cs})
  2303               (\3.*?\?>)
  2304           )/lc($4) eq lc($charset) ?
  2305               "$1" : "$2$charset$5<!-- " . &escape_original($1) . " -->"/esx;
  2306 
  2307     # <meta charset="charset">
  2308     $content =~ s/(
  2309               (<meta\b[^>]*?${ws}charset${ws}*=${ws}*["']?${ws}*)
  2310               (${cs})
  2311               (.*?>)
  2312           )/lc($3) eq lc($charset) ?
  2313               "$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix;
  2314 
  2315     # <meta http-equiv="content-type" content="some/type; charset=charset">
  2316     $content =~ s/(
  2317               (<meta\b[^>]*${ws}
  2318                   http-equiv${ws}*=${ws}*["']?${ws}*content-type\b[^>]*?${ws}
  2319                   content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
  2320               (${cs})
  2321               (.*?>)
  2322           )/lc($3) eq lc($charset) ?
  2323               "$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix;
  2324 
  2325     # <meta content="some/type; charset=charset" http-equiv="content-type">
  2326     $content =~ s/(
  2327               (<meta\b[^>]*${ws}
  2328                   content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
  2329               (${cs})
  2330               ([^>]*?${ws}http-equiv${ws}*=${ws}*["']?${ws}*content-type\b.*?>)
  2331           )/lc($3) eq lc($charset) ?
  2332               "$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix;
  2333 
  2334     $File->{Content} = [split /\n/, $content];
  2335 }
  2336 
  2337 sub set_error_uri ($$)
  2338 {
  2339     my ($err, $uri) = @_;
  2340 
  2341     # We want errors in the doc that was validated to appear without
  2342     # $err->{uri}, and non-doc errors with it pointing to the external entity
  2343     # or the like where the error is.  This usually works as long as we're
  2344     # passing docs to parsers as strings, but S::P::O (at least as of 0.994)
  2345     # seems to give us "3" as the FileName in those cases so we try to filter
  2346     # out everything that doesn't look like a useful URI.
  2347     if ($uri && index($uri, '/') != -1) {
  2348 
  2349         # Mask local file paths
  2350         my $euri = URI->new($uri);
  2351         if (!$euri->scheme() || $euri->scheme() eq 'file') {
  2352             $err->{uri_is_file} = TRUE;
  2353             $err->{uri}         = ($euri->path_segments())[-1];
  2354         }
  2355         else {
  2356             $err->{uri} = $euri->canonical();
  2357         }
  2358     }
  2359 }
  2360 
  2361 #
  2362 # Generate a HTML report of detected errors.
  2363 sub report_errors ($)
  2364 {
  2365     my $File   = shift;
  2366     my $Errors = [];
  2367     my %Errors_bytype;
  2368     my $number_of_errors   = 0;
  2369     my $number_of_warnings = 0;
  2370     my $number_of_info     = 0;
  2371 
  2372     # for the sake of readability, at least until the xmlwf errors have
  2373     # explanations, we push the errors from the XML parser at the END of the
  2374     # error list.
  2375     push @{$File->{Errors}}, @{$File->{WF_Errors}};
  2376 
  2377     if (scalar @{$File->{Errors}}) {
  2378         foreach my $err (@{$File->{Errors}}) {
  2379             my $col = 0;
  2380 
  2381             # Populate source/context for errors in our doc that don't have it
  2382             # already.  Checkers should always have populated $err->{src} with
  2383             # _something_ for non-doc errors.
  2384             if (!defined($err->{src})) {
  2385                 my $line = undef;
  2386 
  2387                 # Avoid truncating lines that do not exist.
  2388                 if (defined($err->{line}) &&
  2389                     $File->{Content}->[$err->{line} - 1])
  2390                 {
  2391                     if (defined($err->{char}) && $err->{char} =~ /^[0-9]+$/) {
  2392                         ($line, $col) =
  2393                             &truncate_line(
  2394                             $File->{Content}->[$err->{line} - 1],
  2395                             $err->{char});
  2396                         $line = &mark_error($line, $col);
  2397                     }
  2398                     elsif (defined($err->{line})) {
  2399                         $col = length($File->{Content}->[$err->{line} - 1]);
  2400                         $col = 80 if ($col > 80);
  2401                         ($line, $col) =
  2402                             &truncate_line(
  2403                             $File->{Content}->[$err->{line} - 1], $col);
  2404                         $line = &ent($line);
  2405                         $col  = 0;
  2406                     }
  2407                 }
  2408                 else {
  2409                     $col = 0;
  2410                 }
  2411                 $err->{src} = $line;
  2412             }
  2413 
  2414             my $explanation = "";
  2415             if ($err->{expl}) {
  2416 
  2417             }
  2418             else {
  2419                 if ($err->{num}) {
  2420                     my $num = $err->{num};
  2421                     $explanation .= Encode::decode_utf8(
  2422                         "\n    $RSRC{msg}->{$num}->{verbose}\n")
  2423                         if exists $RSRC{msg}->{$num} &&
  2424                             exists $RSRC{msg}->{$num}->{verbose};
  2425                     my $_msg = $RSRC{msg}->{nomsg}->{verbose};
  2426                     $_msg =~ s/<!--MID-->/$num/g;
  2427                     if (($File->{'Is Upload'}) or ($File->{'Direct Input'})) {
  2428                         $_msg =~ s/<!--URI-->//g;
  2429                     }
  2430                     else {
  2431                         my $escaped_uri = uri_escape($File->{URI});
  2432                         $_msg =~ s/<!--URI-->/$escaped_uri/g;
  2433                     }
  2434 
  2435                     # The send feedback plea.
  2436                     $explanation = "    $_msg\n$explanation";
  2437                     $explanation =~ s/<!--#echo\s+var="relroot"\s*-->//g;
  2438                 }
  2439                 $err->{expl} = $explanation;
  2440             }
  2441 
  2442             $err->{col} = ' ' x $col;
  2443             if ($err->{type} eq 'I') {
  2444                 $err->{class}         = 'msg_info';
  2445                 $err->{err_type_err}  = 0;
  2446                 $err->{err_type_warn} = 0;
  2447                 $err->{err_type_info} = 1;
  2448                 $number_of_info += 1;
  2449             }
  2450             elsif ($err->{type} eq 'E') {
  2451                 $err->{class}         = 'msg_err';
  2452                 $err->{err_type_err}  = 1;
  2453                 $err->{err_type_warn} = 0;
  2454                 $err->{err_type_info} = 0;
  2455                 $number_of_errors += 1;
  2456             }
  2457             elsif (($err->{type} eq 'W') or ($err->{type} eq 'X')) {
  2458                 $err->{class}         = 'msg_warn';
  2459                 $err->{err_type_err}  = 0;
  2460                 $err->{err_type_warn} = 1;
  2461                 $err->{err_type_info} = 0;
  2462                 $number_of_warnings += 1;
  2463             }
  2464 
  2465             # TODO other classes for "X" etc? FIXME find all types of message.
  2466 
  2467             push @{$Errors}, $err;
  2468 
  2469             if (($File->{Opt}->{'Group Errors'}) and
  2470                 (($err->{type} eq 'E') or
  2471                     ($err->{type} eq 'W') or
  2472                     ($err->{type} eq 'X'))
  2473                 )
  2474             {
  2475 
  2476                 # index by num for errors and warnings only - info usually
  2477                 # gives context of error or warning
  2478                 if (!exists $Errors_bytype{$err->{num}}) {
  2479                     $Errors_bytype{$err->{num}}->{instances} = [];
  2480                     my $msg_text;
  2481                     if ($err->{num} eq 'xmlwf') {
  2482 
  2483                         # FIXME need a catalog of errors from XML::LibXML
  2484                         $msg_text = "XML Parsing Error";
  2485                     }
  2486                     elsif ($err->{num} eq 'html5') {
  2487                         $msg_text = "HTML5 Validator Error";
  2488                     }
  2489                     else {
  2490                         $msg_text = $RSRC{msg}->{$err->{num}}->{original};
  2491                         $msg_text =~ s/%1/X/;
  2492                         $msg_text =~ s/%2/Y/;
  2493                     }
  2494                     $Errors_bytype{$err->{num}}->{expl}        = $err->{expl};
  2495                     $Errors_bytype{$err->{num}}->{generic_msg} = $msg_text;
  2496                     $Errors_bytype{$err->{num}}->{msg}         = $err->{msg};
  2497                     $Errors_bytype{$err->{num}}->{type}        = $err->{type};
  2498                     $Errors_bytype{$err->{num}}->{class}       = $err->{class};
  2499                     $Errors_bytype{$err->{num}}->{err_type_err} =
  2500                         $err->{err_type_err};
  2501                     $Errors_bytype{$err->{num}}->{err_type_warn} =
  2502                         $err->{err_type_warn};
  2503                     $Errors_bytype{$err->{num}}->{err_type_info} =
  2504                         $err->{err_type_info};
  2505                 }
  2506                 push @{$Errors_bytype{$err->{num}}->{instances}}, $err;
  2507             }
  2508         }
  2509     }
  2510 
  2511     @$Errors = values(%Errors_bytype) if $File->{Opt}->{'Group Errors'};
  2512 
  2513     # we are not sorting errors by line, as it would break the position
  2514     # of auxiliary messages such as "start tag was here". We'll have to live
  2515     # with the fact that XML well-formedness errors are listed first, then
  2516     # validation errors
  2517     #else {
  2518     #   sort error by lines
  2519     #  @{$Errors} = sort {$a->{line} <=> $b->{line} } @{$Errors};
  2520     #}
  2521     return $number_of_errors, $number_of_warnings, $number_of_info, $Errors;
  2522 }
  2523 
  2524 #
  2525 # Chop the source line into 3 pieces; the character at which the error
  2526 # was detected, and everything to the left and right of that position.
  2527 # That way we can add markup to the relevant char without breaking &ent().
  2528 # Expects 1-based column indexes.
  2529 sub mark_error ($$)
  2530 {
  2531     my $line    = shift;
  2532     my $col     = shift;
  2533     my $linelen = length($line);
  2534 
  2535     # Coerce column into an index valid within the line.
  2536     if ($col < 1) {
  2537         $col = 1;
  2538     }
  2539     elsif ($col > $linelen) {
  2540         $col = $linelen;
  2541     }
  2542     $col--;
  2543 
  2544     my $left = substr($line, 0,    $col);
  2545     my $char = substr($line, $col, 1);
  2546     my $right = substr($line, $col + 1);
  2547 
  2548     $char = &ent($char);
  2549     $char =
  2550         qq(<strong title="Position where error was detected.">$char</strong>);
  2551     $line = &ent($left) . $char . &ent($right);
  2552 
  2553     return $line;
  2554 }
  2555 
  2556 #
  2557 # Create a HTML representation of the document.
  2558 sub source
  2559 {
  2560     my $File = shift;
  2561 
  2562     # Remove any BOM since we're not at BOT anymore...
  2563     $File->{Content}->[0] = substr($File->{Content}->[0], 1)
  2564         if ($File->{BOM} && scalar(@{$File->{Content}}));
  2565 
  2566     my @source = map({file_source_line => $_}, @{$File->{Content}});
  2567     return \@source;
  2568 }
  2569 
  2570 sub match_DTD_FPI_SI
  2571 {
  2572     my ($File, $FPI, $SI) = @_;
  2573     if ($CFG->{Types}->{$FPI}) {
  2574         if ($CFG->{Types}->{$FPI}->{SysID}) {
  2575             if ($SI ne $CFG->{Types}->{$FPI}->{SysID}) {
  2576                 &add_warning(
  2577                     'W26',
  2578                     {   W26_dtd_pub => $FPI,
  2579                         W26_dtd_pub_display =>
  2580                             $CFG->{Types}->{$FPI}->{Display},
  2581                         W26_dtd_sys           => $SI,
  2582                         W26_dtd_sys_recommend => $CFG->{Types}->{$FPI}->{SysID}
  2583                     }
  2584                 );
  2585             }
  2586         }
  2587     }
  2588     else {    # FPI not known, checking if the SI is
  2589         while (my ($proper_FPI, $value) = each %{$CFG->{Types}}) {
  2590             if ($value->{SysID} && $value->{SysID} eq $SI) {
  2591                 &add_warning(
  2592                     'W26',
  2593                     {   W26_dtd_pub           => $FPI,
  2594                         W26_dtd_pub_display   => $value->{Display},
  2595                         W26_dtd_sys           => $SI,
  2596                         W26_dtd_pub_recommend => $proper_FPI
  2597                     }
  2598                 );
  2599             }
  2600         }
  2601     }
  2602 }
  2603 
  2604 #
  2605 # Do an initial parse of the Document Entity to extract FPI.
  2606 sub preparse_doctype
  2607 {
  2608     my $File = shift;
  2609 
  2610     #
  2611     # Reset DOCTYPE, Root (for second invocation, probably not needed anymore).
  2612     $File->{DOCTYPE} = '';
  2613     $File->{Root}    = '';
  2614 
  2615     my $dtd = sub {
  2616         return if $File->{Root};
  2617 
  2618         # TODO: The \s and \w are probably wrong now that the strings are
  2619         # utf8_on
  2620         my $declaration = shift;
  2621         my $doctype_type;
  2622         my $doctype_secondpart;
  2623         if ($declaration =~
  2624             /<!DOCTYPE\s+html(?:\s+SYSTEM\s+(['"])about:legacy-compat\1)?\s*>/si
  2625             )
  2626         {
  2627             $File->{Root}    = "html";
  2628             $File->{DOCTYPE} = "HTML5";
  2629         }
  2630         elsif ($declaration =~
  2631             m(<!DOCTYPE\s+(\w[\w\.-]+)\s+(PUBLIC|SYSTEM)\s+(?:[\'\"])([^\"\']+)(?:[\"\'])(.*)>)si
  2632             )
  2633         {
  2634             (   $File->{Root},    $doctype_type,
  2635                 $File->{DOCTYPE}, $doctype_secondpart
  2636             ) = ($1, $2, $3, $4);
  2637             if (($doctype_type eq "PUBLIC") and
  2638                 (($doctype_secondpart) =
  2639                     $doctype_secondpart =~
  2640                     m(\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*)si)
  2641                 )
  2642             {
  2643                 &match_DTD_FPI_SI($File, $File->{DOCTYPE},
  2644                     $doctype_secondpart);
  2645             }
  2646         }
  2647     };
  2648 
  2649     my $start = sub {
  2650         my ($p, $tag, $attr) = @_;
  2651 
  2652         if ($File->{Root}) {
  2653             return unless $tag eq $File->{Root};
  2654         }
  2655         else {
  2656             $File->{Root} = $tag;
  2657         }
  2658         if ($attr->{xmlns}) {
  2659             $File->{Namespace} = $attr->{xmlns};
  2660         }
  2661         if ($attr->{version}) {
  2662             $File->{'Root Version'} = $attr->{version};
  2663         }
  2664         if ($attr->{baseProfile}) {
  2665             $File->{'Root BaseProfile'} = $attr->{baseProfile};
  2666         }
  2667 
  2668         # We're done parsing.
  2669         $p->eof();
  2670     };
  2671 
  2672     # we use HTML::Parser as pre-parser. May use html5lib or other in the future
  2673     my $p = HTML::Parser->new(api_version => 3);
  2674 
  2675     # if content-type has shown we should pre-parse with XML mode, use that
  2676     # otherwise (mostly text/html cases) use default mode
  2677     $p->xml_mode(&is_xml($File));
  2678     $p->handler(declaration => $dtd,   'text');
  2679     $p->handler(start       => $start, 'self,tag,attr');
  2680 
  2681     my $line = 0;
  2682     my $max  = scalar(@{$File->{Content}});
  2683     $p->parse(
  2684         sub {
  2685             return ($line < $max) ? $File->{Content}->[$line++] . "\n" : undef;
  2686         }
  2687     );
  2688     $p->eof();
  2689 
  2690     # TODO: These \s here are probably wrong now that the strings are utf8_on
  2691     $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE};
  2692     $File->{DOCTYPE} =~ s(^\s+){ }g;
  2693     $File->{DOCTYPE} =~ s(\s+$){ }g;
  2694     $File->{DOCTYPE} =~ s(\s+) { }g;
  2695 
  2696     # Some document types actually need no doctype to be identified,
  2697     # root element and some version attribute is enough
  2698     # TODO applicable doctypes should be migrated to a config file?
  2699 
  2700     # if (($File->{DOCTYPE} eq '') and ($File->{Root} eq "svg") ) {
  2701     #   if (($File->{'Root Version'}) or ($File->{'Root BaseProfile'}))
  2702     #   {
  2703     #     if (! $File->{'Root Version'}) { $File->{'Root Version'} = "0"; }
  2704     #     if (! $File->{'Root BaseProfile'}) { $File->{'Root BaseProfile'} = "0"; }
  2705     #     if ($File->{'Root Version'} eq "1.0"){
  2706     #       $File->{DOCTYPE} = "-//W3C//DTD SVG 1.0//EN";
  2707     #       $File->{"DOCTYPEless OK"} = TRUE;
  2708     #       $File->{Opt}->{DOCTYPE} = "SVG 1.0";
  2709     #     }
  2710     #     if ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "tiny")) {
  2711     #         $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Tiny//EN";
  2712     #         $File->{"DOCTYPEless OK"} = TRUE;
  2713     #         $File->{Opt}->{DOCTYPE} = "SVG 1.1 Tiny";
  2714     #     }
  2715     #     elsif ((($File->{'Root Version'} eq "1.1")  or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "basic")) {
  2716     #         $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Basic//EN";
  2717     #         $File->{Opt}->{DOCTYPE} = "SVG 1.1 Basic";
  2718     #         $File->{"DOCTYPEless OK"} = TRUE;
  2719     #     }
  2720     #     elsif (($File->{'Root Version'} eq "1.1") and (!$File->{'Root BaseProfile'})) {
  2721     #         $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN";
  2722     #         $File->{Opt}->{DOCTYPE} = "SVG 1.1";
  2723     #         $File->{"DOCTYPEless OK"} = TRUE;
  2724     #     }
  2725     #     if ($File->{'Root Version'} eq "0") { $File->{'Root Version'} = undef; }
  2726     #     if ($File->{'Root BaseProfile'} eq "0") { $File->{'Root BaseProfile'} = undef; }
  2727     #   }
  2728     #   else {
  2729     #     # by default for an svg root elt, we use SVG 1.1
  2730     #     $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN";
  2731     #     $File->{Opt}->{DOCTYPE} = "SVG 1.1";
  2732     #     $File->{"DOCTYPEless OK"} = TRUE;
  2733     #   }
  2734     # }
  2735     if (($File->{"DOCTYPEless OK"}) and ($File->{Opt}->{DOCTYPE})) {
  2736 
  2737         # doctypeless document type found, we fake the override
  2738         # so that the parser will have something to validate against
  2739         $File = &override_doctype($File);
  2740     }
  2741     return $File;
  2742 }
  2743 
  2744 #
  2745 # Preprocess CGI parameters.
  2746 sub prepCGI
  2747 {
  2748     my $File = shift;
  2749     my $q    = shift;
  2750 
  2751     # The URL to this CGI script.
  2752     $File->{Env}->{'Self URI'} = $q->url();
  2753 
  2754     # Decode parameter values, set booleans the way we expect them.
  2755     foreach my $param ($q->param()) {
  2756 
  2757         # 'uploaded_file' and 'fragment' contain data we treat as is.
  2758         next if ($param eq 'uploaded_file' || $param eq 'fragment');
  2759 
  2760         # Decode all other defined values as UTF-8.
  2761         my @values = map { Encode::decode_utf8($_) } $q->param($param);
  2762         $q->param($param, @values);
  2763 
  2764         # Skip parameters that should not be treated as booleans.
  2765         next if $param =~ /^(?:accept(?:-(?:language|charset))?|ur[il])$/;
  2766 
  2767         # Keep false-but-set params.
  2768         next if $q->param($param) eq '0';
  2769 
  2770         # Parameters that are given to us without specifying a value get set
  2771         # to a true value.
  2772         $q->param($param, TRUE) unless $q->param($param);
  2773     }
  2774 
  2775     $File->{Env}->{'Home Page'} =
  2776         URI->new_abs(".", $File->{Env}->{'Self URI'});
  2777 
  2778     # Use "url" unless a "uri" was also given.
  2779     if ($q->param('url') and not $q->param('uri')) {
  2780         $q->param('uri', $q->param('url'));
  2781     }
  2782 
  2783     # Set output mode; needed in get_error_template if we end up there.
  2784     $File->{Opt}->{Output} = $q->param('output') || 'html';
  2785 
  2786     # Issue a redirect for uri=referer.
  2787     if ($q->param('uri') and $q->param('uri') eq 'referer') {
  2788         if ($q->referer) {
  2789             $q->param('uri', $q->referer);
  2790             $q->param('accept', $q->http('Accept')) if ($q->http('Accept'));
  2791             $q->param('accept-language', $q->http('Accept-Language'))
  2792                 if ($q->http('Accept-Language'));
  2793             $q->param('accept-charset', $q->http('Accept-Charset'))
  2794                 if ($q->http('Accept-Charset'));
  2795             print redirect(-uri => &self_url_q($q, $File), -vary => 'Referer');
  2796             exit;
  2797         }
  2798         else {
  2799 
  2800             # No Referer header was found.
  2801             $File->{'Error Flagged'} = TRUE;
  2802             &get_error_template($File)->param(fatal_referer_error => TRUE);
  2803         }
  2804     }
  2805 
  2806     # Supersede URL with an uploaded file.
  2807     if ($q->param('uploaded_file')) {
  2808         $q->param('uri', 'upload://' . $q->param('uploaded_file'));
  2809         $File->{'Is Upload'} = TRUE;    # Tag it for later use.
  2810     }
  2811 
  2812     # Supersede URL with an uploaded fragment.
  2813     if ($q->param('fragment')) {
  2814         $q->param('uri', 'upload://Form Submission');
  2815         $File->{'Direct Input'} = TRUE;    # Tag it for later use.
  2816     }
  2817 
  2818     # Redirect to a GETable URL if method is POST without a file upload.
  2819     if (defined $q->request_method and
  2820         $q->request_method eq 'POST' and
  2821         not($File->{'Is Upload'} or $File->{'Direct Input'}))
  2822     {
  2823         my $thispage = &self_url_q($q, $File);
  2824         print redirect $thispage;
  2825         exit;
  2826     }
  2827 
  2828     #
  2829     # Flag an error if we didn't get a file to validate.
  2830     unless ($q->param('uri')) {
  2831         $File->{'Error Flagged'} = TRUE;
  2832         my $tmpl = &get_error_template($File);
  2833         $tmpl->param(fatal_uri_error  => TRUE);
  2834         $tmpl->param(fatal_uri_scheme => 'undefined');
  2835     }
  2836 
  2837     return $q;
  2838 }
  2839 
  2840 #
  2841 # Set parse mode (SGML or XML) based on a number of preparsed factors:
  2842 # * HTTP Content-Type
  2843 # * Doctype Declaration
  2844 # * XML Declaration
  2845 # * XML namespaces
  2846 sub set_parse_mode
  2847 {
  2848     my $File = shift;
  2849     my $CFG  = shift;
  2850     my $fpi  = $File->{DOCTYPE};
  2851     $File->{ModeChoice} = '';
  2852     my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'} || 'TBD';
  2853 
  2854     my $xmlws = qr/[\x20\x09\x0D\x0A]/o;
  2855 
  2856     # $File->{Mode} may have been set in parse_content_type
  2857     # and it would come from the Media Type
  2858     my $parseModeFromMimeType = $File->{Mode};
  2859     my $begincontent          = join "\x20",
  2860         @{$File->{Content}};    # for the sake of xml decl detection,
  2861                                 # the 10 first lines should be safe
  2862     my $parseModeFromXMLDecl = (
  2863         $begincontent =~
  2864             /^ ${xmlws}*                # whitespace before the decl should not be happening
  2865                                         # but we are greedy for the sake of detection, not validation
  2866       <\?xml ${xmlws}+                  # start matching an XML Declaration
  2867       version ${xmlws}* =               # for documents, version info is mandatory
  2868       ${xmlws}* (["'])1.[01]\1          # hardcoding the existing XML versions.
  2869                                         # Maybe we should use \d\.\d
  2870       (?:${xmlws}+ encoding
  2871        ${xmlws}* = ${xmlws}*
  2872        (["'])[A-Za-z][a-zA-Z0-9_-]+\2
  2873       )?                                # encoding info is optional
  2874       (?:${xmlws}+ standalone
  2875        ${xmlws}* = ${xmlws}*
  2876        (["'])(?:yes|no)\3
  2877       )?                                # ditto standalone info, optional
  2878       ${xmlws}* \?>                     # end of XML Declaration
  2879     /ox
  2880         ?
  2881             'XML' :
  2882             'TBD'
  2883     );
  2884 
  2885     my $parseModeFromNamespace = 'TBD';
  2886     # http://www.w3.org/Bugs/Public/show_bug.cgi?id=9967
  2887     $parseModeFromNamespace = 'XML'
  2888         if ($File->{Namespace} && $parseModeFromDoctype ne 'HTML5');
  2889 
  2890     if (($parseModeFromMimeType eq 'TBD') and
  2891         ($parseModeFromXMLDecl   eq 'TBD') and
  2892         ($parseModeFromNamespace eq 'TBD') and
  2893         (!exists $CFG->{Types}->{$fpi}))
  2894     {
  2895 
  2896         # if the mime type is text/html (ambiguous, hence TBD mode)
  2897         # and the doctype isn't in the catalogue
  2898         # and XML prolog detection was unsuccessful
  2899         # and we found no namespace at the root
  2900         # ... throw in a warning
  2901         &add_warning(
  2902             'W06',
  2903             {   W06_mime    => $File->{ContentType},
  2904                 w06_doctype => $File->{DOCTYPE}
  2905             }
  2906         );
  2907         return;
  2908     }
  2909 
  2910     $parseModeFromDoctype = 'TBD'
  2911         unless $parseModeFromDoctype eq 'SGML' or
  2912             $parseModeFromDoctype eq 'HTML5' or
  2913             $parseModeFromDoctype eq 'XML'   or
  2914             $parseModeFromNamespace eq 'XML';
  2915 
  2916     if (($parseModeFromDoctype eq 'TBD') and
  2917         ($parseModeFromXMLDecl  eq 'TBD') and
  2918         ($parseModeFromMimeType eq 'TBD') and
  2919         ($parseModeFromNamespace eq 'TBD'))
  2920     {
  2921 
  2922         # if all factors are useless to give us a parse mode
  2923         # => we use SGML-based DTD validation as a default
  2924         $File->{Mode}       = 'DTD+SGML';
  2925         $File->{ModeChoice} = 'Fallback';
  2926 
  2927         # and send warning about the fallback
  2928         &add_warning(
  2929             'W06',
  2930             {   W06_mime    => $File->{ContentType},
  2931                 w06_doctype => $File->{DOCTYPE}
  2932             }
  2933         );
  2934         return;
  2935     }
  2936 
  2937     if ($parseModeFromMimeType ne 'TBD') {
  2938 
  2939         # if The mime type gives clear indication of whether the document is
  2940         # XML or not
  2941         if (($parseModeFromDoctype ne 'TBD') and
  2942             ($parseModeFromDoctype ne 'HTML5') and
  2943             ($parseModeFromMimeType ne $parseModeFromDoctype))
  2944         {
  2945 
  2946             # if document-type recommended mode and content-type recommended
  2947             # mode clash, shoot a warning
  2948             # unknown doctypes will not trigger this
  2949             # neither will html5 documents, which can be XML or not
  2950             &add_warning(
  2951                 'W07',
  2952                 {   W07_mime => $File->{ContentType},
  2953                     W07_ct   => $parseModeFromMimeType,
  2954                     W07_dtd  => $parseModeFromDoctype,
  2955                 }
  2956             );
  2957         }
  2958 
  2959         # mime type has precedence, we stick to it
  2960         $File->{ModeChoice} = 'Mime';
  2961         if ($parseModeFromDoctype eq "HTML5") {
  2962             $File->{Mode} = 'HTML5+' . $File->{Mode};
  2963         }
  2964         else {
  2965             $File->{Mode} = 'DTD+' . $File->{Mode};
  2966         }
  2967         return;
  2968     }
  2969 
  2970     if ($parseModeFromDoctype ne 'TBD') {
  2971 
  2972         # the mime type is ambiguous (hence we didn't stop at the previous test)
  2973         # but by now we're sure that the document type is a good indication
  2974         # so we use that.
  2975         if ($parseModeFromDoctype eq "HTML5") {
  2976             if ($parseModeFromXMLDecl eq "XML" or
  2977                 $parseModeFromNamespace eq "XML")
  2978             {
  2979                 $File->{Mode} = "HTML5+XML";
  2980             }
  2981             else {
  2982                 $File->{Mode} = "HTML5";
  2983             }
  2984         }
  2985         else {    # not HTML5
  2986             $File->{Mode} = "DTD+" . $parseModeFromDoctype;
  2987         }
  2988         $File->{ModeChoice} = 'Doctype';
  2989         return;
  2990     }
  2991 
  2992     if ($parseModeFromXMLDecl ne 'TBD') {
  2993 
  2994         # the mime type is ambiguous (hence we didn't stop at the previous test)
  2995         # and so was the doctype
  2996         # but we found an XML declaration so we use that.
  2997         if ($File->{Mode} eq "") {
  2998             $File->{Mode} = "DTD+" . $parseModeFromXMLDecl;
  2999         }
  3000         elsif ((my $ix = index($File->{Mode}, '+')) != -1) {
  3001             substr($File->{Mode}, $ix + 1) = $parseModeFromXMLDecl;
  3002         }
  3003         else {
  3004             $File->{Mode} = $File->{Mode} . "+" . $parseModeFromXMLDecl;
  3005         }
  3006         $File->{ModeChoice} = 'XMLDecl';
  3007         return;
  3008     }
  3009 
  3010     # this is the last case. We know that all  modes are not TBD,
  3011     # yet mime type, doctype AND XML DECL tests have failed => we are saved
  3012     # by the presence of namespaces
  3013     if ($File->{Mode} eq "") {
  3014         $File->{Mode} = "DTD+" . $parseModeFromNamespace;
  3015     }
  3016     elsif ((my $ix = index($File->{Mode}, '+')) != -1) {
  3017         substr($File->{Mode}, $ix + 1) = $parseModeFromNamespace;
  3018     }
  3019     else {
  3020         $File->{Mode} = $File->{Mode} . "+" . $parseModeFromNamespace;
  3021     }
  3022     $File->{ModeChoice} = 'Namespace';
  3023 }
  3024 
  3025 #
  3026 # Utility sub to tell if mode "is" XML.
  3027 sub is_xml
  3028 {
  3029     index(shift->{Mode}, 'XML') != -1;
  3030 }
  3031 
  3032 #
  3033 # Check charset conflicts and add any warnings necessary.
  3034 sub charset_conflicts
  3035 {
  3036     my $File = shift;
  3037 
  3038     #
  3039     # Handle the case where there was no charset to be found.
  3040     unless ($File->{Charset}->{Use}) {
  3041         &add_warning('W17', {});
  3042         $File->{Tentative} |= T_WARN;
  3043     }
  3044 
  3045     #
  3046     # Add a warning if there was charset info conflict (HTTP header,
  3047     # XML declaration, or <meta> element).
  3048     # filtering out some of the warnings in direct input mode where HTTP
  3049     # encoding is a "fake"
  3050     if ((   charset_not_equal(
  3051                 $File->{Charset}->{HTTP},
  3052                 $File->{Charset}->{XML}
  3053             )
  3054         ) and
  3055         not($File->{'Direct Input'})
  3056         )
  3057     {
  3058         &add_warning(
  3059             'W18',
  3060             {   W18_http => $File->{Charset}->{HTTP},
  3061                 W18_xml  => $File->{Charset}->{XML},
  3062                 W18_use  => $File->{Charset}->{Use},
  3063             }
  3064         );
  3065     }
  3066     elsif (
  3067         charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META})
  3068         and
  3069         not($File->{'Direct Input'}))
  3070     {
  3071         &add_warning(
  3072             'W19',
  3073             {   W19_http => $File->{Charset}->{HTTP},
  3074                 W19_meta => $File->{Charset}->{META},
  3075                 W19_use  => $File->{Charset}->{Use},
  3076             }
  3077         );
  3078     }
  3079     elsif (
  3080         charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META}))
  3081     {
  3082         &add_warning(
  3083             'W20',
  3084             {   W20_xml  => $File->{Charset}->{XML},
  3085                 W20_meta => $File->{Charset}->{META},
  3086             }
  3087         );
  3088         $File->{Tentative} |= T_WARN;
  3089     }
  3090 
  3091     return $File;
  3092 }
  3093 
  3094 #
  3095 # Transcode to UTF-8
  3096 sub transcode
  3097 {
  3098     my $File = shift;
  3099 
  3100     my $general_charset = $File->{Charset}->{Use};
  3101     my $exact_charset   = $general_charset;
  3102 
  3103     # TODO: This should be done before transcode()
  3104     if ($general_charset eq 'utf-16') {
  3105         if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) {
  3106             $exact_charset = $File->{Charset}->{Auto};
  3107         }
  3108         else { $exact_charset = 'utf-16be'; }
  3109     }
  3110 
  3111     my $cs = $exact_charset;
  3112 
  3113     if ($CFG->{Charsets}->{$cs}) {
  3114         if (index($CFG->{Charsets}->{$cs}, 'ERR ') != -1) {
  3115 
  3116             # The encoding is not supported due to policy
  3117 
  3118             $File->{'Error Flagged'} = TRUE;
  3119             my $tmpl = &get_error_template($File);
  3120             $tmpl->param(fatal_transcode_error   => TRUE);
  3121             $tmpl->param(fatal_transcode_charset => $cs);
  3122 
  3123             # @@FIXME might need better text
  3124             $tmpl->param(fatal_transcode_errmsg =>
  3125                     'This encoding is not supported by the validator.');
  3126             return $File;
  3127         }
  3128         elsif (index($CFG->{Charsets}->{$cs}, 'X ') != -1) {
  3129 
  3130             # possibly problematic, we recommend another alias
  3131             my $recommended_charset = $CFG->{Charsets}->{$cs};
  3132             $recommended_charset =~ s/X //;
  3133             &add_warning(
  3134                 'W22',
  3135                 {   W22_declared  => $cs,
  3136                     W22_suggested => $recommended_charset,
  3137                 }
  3138             );
  3139         }
  3140     }
  3141 
  3142     # Does the system support decoding this encoding?
  3143     my $enc = Encode::find_encoding($cs);
  3144 
  3145     if (!$enc) {
  3146 
  3147         # This system's Encode installation does not support
  3148         # the character encoding; might need additional modules
  3149 
  3150         $File->{'Error Flagged'} = TRUE;
  3151         my $tmpl = &get_error_template($File);
  3152         $tmpl->param(fatal_transcode_error   => TRUE);
  3153         $tmpl->param(fatal_transcode_charset => $cs);
  3154 
  3155         # @@FIXME might need better text
  3156         $tmpl->param(fatal_transcode_errmsg => 'Encoding not supported.');
  3157         return $File;
  3158     }
  3159     elsif (!$CFG->{Charsets}->{$cs}) {
  3160 
  3161         # not in the list, but technically OK -> we warn
  3162         &add_warning('W24', {W24_declared => $cs,});
  3163 
  3164     }
  3165 
  3166     my $output;
  3167     my $input = $File->{Bytes};
  3168 
  3169     # Try to transcode
  3170     eval { $output = $enc->decode($input, Encode::FB_CROAK); };
  3171 
  3172     if ($@) {
  3173 
  3174         # Transcoding failed - do it again line by line to find out exactly
  3175         # where
  3176         my $line_num = 0;
  3177         while ($input =~ /(.*?)(?:\r\n|\n|\r|\z)/g) {
  3178             $line_num++;
  3179             eval { $enc->decode($1, Encode::FB_CROAK); };
  3180             if ($@) {
  3181                 my $croak_message = $@;
  3182                 $croak_message =~ s/ at .*//;
  3183                 $File->{'Error Flagged'} = TRUE;
  3184                 my $tmpl = &get_error_template($File);
  3185                 $tmpl->param(fatal_byte_error     => TRUE);
  3186                 $tmpl->param(fatal_byte_lines     => $line_num);
  3187                 $tmpl->param(fatal_byte_charset   => $cs);
  3188                 $tmpl->param(fatal_byte_error_msg => $croak_message);
  3189                 last;
  3190             }
  3191         }
  3192         return $File;
  3193     }
  3194 
  3195     # @@FIXME is this what we want?
  3196     $output =~ s/\015?\012/\n/g;
  3197 
  3198     # make sure we deal only with unix newlines
  3199     # tentative fix for http://www.w3.org/Bugs/Public/show_bug.cgi?id=3992
  3200     $output =~ s/(\r\n|\n|\r)/\n/g;
  3201 
  3202     #debug: we could check if the content has utf8 bit on with
  3203     #$output= utf8::is_utf8($output) ? 1 : 0;
  3204     $File->{Content} = [split /\n/, $output];
  3205 
  3206     return $File;
  3207 }
  3208 
  3209 sub find_encodings
  3210 {
  3211     my $File  = shift;
  3212     my $bom   = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes});
  3213     my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes});
  3214 
  3215     if (defined $bom) {
  3216 
  3217         # @@FIXME this BOM entry should not be needed at all!
  3218         $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}"));
  3219         $File->{Charset}->{Auto} = lc $bom;
  3220     }
  3221     else {
  3222         $File->{Charset}->{Auto} = lc($first[0]) if @first;
  3223     }
  3224 
  3225     my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes});
  3226     $File->{Charset}->{XML} = lc $xml if defined $xml;
  3227 
  3228     my %metah;
  3229     foreach my $try (@first) {
  3230 
  3231         # @@FIXME I think the old code used HTML::Parser xml mode, check if ok
  3232         my $meta =
  3233             HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try);
  3234         $metah{lc($meta)}++ if defined $meta and length $meta;
  3235     }
  3236 
  3237     if (!%metah) {
  3238 
  3239         # HTML::Encoding doesn't support HTML5 <meta charset> as of 0.60,
  3240         # check it ourselves.  HTML::HeadParser >= 3.60 is required for this.
  3241 
  3242         my $hp           = HTML::HeadParser->new();
  3243         my $seen_doctype = FALSE;
  3244         my $is_html5     = FALSE;
  3245         $hp->handler(
  3246             declaration => sub {
  3247                 my ($tag, $text) = @_;
  3248                 return if ($seen_doctype || uc($tag) ne '!DOCTYPE');
  3249                 $seen_doctype = TRUE;
  3250                 $is_html5     = TRUE
  3251                     if (
  3252                     $text =~ /<!DOCTYPE\s+html
  3253                                     (\s+SYSTEM\s+(['"])about:legacy-compat\2)?
  3254                                     \s*>/six
  3255                     );
  3256             },
  3257             'tag,text'
  3258         );
  3259         $hp->parse($File->{Bytes});
  3260         if ($is_html5) {
  3261             my $cs = $hp->header('X-Meta-Charset');
  3262             $metah{lc($cs)}++ if (defined($cs) && length($cs));
  3263         }
  3264     }
  3265 
  3266     if (%metah) {
  3267         my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah;
  3268         $File->{Charset}->{META} = $meta[0];
  3269     }
  3270 
  3271     return $File;
  3272 }
  3273 
  3274 #
  3275 # Abort with a message if an error was flagged at point.
  3276 sub abort_if_error_flagged
  3277 {
  3278     my $File = shift;
  3279 
  3280     return unless $File->{'Error Flagged'};
  3281     return if $File->{'Error Handled'};    # Previous error, keep going.
  3282 
  3283     my $tmpl = &get_error_template($File);
  3284     $tmpl->param(fatal_error => TRUE);
  3285 
  3286     &prep_template($File, $tmpl);
  3287 
  3288     # transcode output from perl's internal to utf-8 and output
  3289     print Encode::encode('UTF-8', $tmpl->output);
  3290     exit;
  3291 }
  3292 
  3293 #
  3294 # conflicting encodings
  3295 sub charset_not_equal
  3296 {
  3297     my $encodingA = shift;
  3298     my $encodingB = shift;
  3299     return $encodingA && $encodingB && ($encodingA ne $encodingB);
  3300 }
  3301 
  3302 #
  3303 # Construct a self-referential URL from a CGI.pm $q object.
  3304 sub self_url_q
  3305 {
  3306     my ($q, $File) = @_;
  3307     my $thispage = $File->{Env}->{'Self URI'} . '?';
  3308 
  3309     # Pass-through parameters
  3310     for my $param (qw(uri accept accept-language accept-charset)) {
  3311         $thispage .= "$param=" . uri_escape($q->param($param)) . ';'
  3312             if $q->param($param);
  3313     }
  3314 
  3315     # Boolean parameters
  3316     for my $param (qw(ss outline No200 verbose group)) {
  3317         $thispage .= "$param=1;" if $q->param($param);
  3318     }
  3319 
  3320     # Others
  3321     if ($q->param('doctype') and $q->param('doctype') !~ /(?:Inline|detect)/i)
  3322     {
  3323         $thispage .= 'doctype=' . uri_escape($q->param('doctype')) . ';';
  3324     }
  3325     if ($q->param('charset') and $q->param('charset') !~ /detect/i) {
  3326         $thispage .= 'charset=' . uri_escape($q->param('charset')) . ';';
  3327     }
  3328 
  3329     $thispage =~ s/[\?;]$//;
  3330     return $thispage;
  3331 }
  3332 
  3333 #
  3334 # Construct a self-referential URL from a $File object.
  3335 sub self_url_file
  3336 {
  3337     my $File = shift;
  3338 
  3339     my $thispage    = $File->{Env}->{'Self URI'};
  3340     my $escaped_uri = uri_escape($File->{URI});
  3341     $thispage .= qq(?uri=$escaped_uri);
  3342     $thispage .= ';ss=1' if $File->{Opt}->{'Show Source'};
  3343     $thispage .= ';st=1' if $File->{Opt}->{'Show Tidy'};
  3344     $thispage .= ';outline=1' if $File->{Opt}->{Outline};
  3345     $thispage .= ';No200=1' if $File->{Opt}->{No200};
  3346     $thispage .= ';verbose=1' if $File->{Opt}->{Verbose};
  3347     $thispage .= ';group=1' if $File->{Opt}->{'Group Errors'};
  3348     $thispage .= ';accept=' . uri_escape($File->{Opt}->{'Accept Header'})
  3349         if $File->{Opt}->{'Accept Header'};
  3350     $thispage .=
  3351         ';accept-language=' .
  3352         uri_escape($File->{Opt}->{'Accept-Language Header'})
  3353         if $File->{Opt}->{'Accept-Language Header'};
  3354     $thispage .=
  3355         ';accept-charset=' .
  3356         uri_escape($File->{Opt}->{'Accept-Charset Header'})
  3357         if $File->{Opt}->{'Accept-Charset Header'};
  3358 
  3359     return $thispage;
  3360 }
  3361 
  3362 #####
  3363 
  3364 package W3C::Validator::EventHandler;
  3365 
  3366 #
  3367 # Define global constants
  3368 use constant TRUE  => 1;
  3369 use constant FALSE => 0;
  3370 
  3371 #
  3372 # Tentative Validation Severities.
  3373 use constant T_WARN  => 4;    # 0000 0100
  3374 use constant T_ERROR => 8;    # 0000 1000
  3375 
  3376 sub new
  3377 {
  3378     my $class  = shift;
  3379     my $parser = shift;
  3380     my $File   = shift;
  3381     my $CFG    = shift;
  3382     my $self   = {_file => $File, CFG => $CFG, _parser => $parser};
  3383     bless $self, $class;
  3384 }
  3385 
  3386 sub start_element
  3387 {
  3388     my ($self, $element) = @_;
  3389 
  3390     my $has_xmlns   = FALSE;
  3391     my $xmlns_value = undef;
  3392 
  3393     # If in XML mode, find namespace used for each element.
  3394     if ((my $attr = $element->{Attributes}->{xmlns}) &&
  3395         &W3C::Validator::MarkupValidator::is_xml($self->{_file}))
  3396     {
  3397         $xmlns_value = "";
  3398 
  3399         # Try with SAX method
  3400         if ($attr->{Value}) {
  3401             $has_xmlns   = TRUE;
  3402             $xmlns_value = $attr->{Value};
  3403         }
  3404 
  3405         #next if $has_xmlns;
  3406 
  3407         # The following is not SAX, but OpenSP specific.
  3408         my $defaulted = $attr->{Defaulted} || '';
  3409         if ($defaulted eq "specified") {
  3410             $has_xmlns = TRUE;
  3411             $xmlns_value .=
  3412                 join("", map { $_->{Data} } @{$attr->{CdataChunks}});
  3413         }
  3414     }
  3415 
  3416     my $doctype = $self->{_file}->{DOCTYPE};
  3417 
  3418     if (!defined($self->{CFG}->{Types}->{$doctype}->{Name}) ||
  3419         $element->{Name} ne $self->{CFG}->{Types}->{$doctype}->{Name})
  3420     {
  3421 
  3422         # add to list of non-root namespaces
  3423         push(@{$self->{_file}->{Namespaces}}, $xmlns_value) if $has_xmlns;
  3424     }
  3425     elsif (!$has_xmlns &&
  3426         $self->{CFG}->{Types}->{$doctype}->{"Namespace Required"})
  3427     {
  3428 
  3429         # whine if the root xmlns attribute is noted as required by spec,
  3430         # but not present
  3431         my $err      = {};
  3432         my $location = $self->{_parser}->get_location();
  3433         &W3C::Validator::MarkupValidator::set_error_uri($err,
  3434             $location->{FileName});
  3435 
  3436         # S::P::O does not provide src context, set to empty for non-doc errors.
  3437         $err->{src}  = "" if $err->{uri};
  3438         $err->{line} = $location->{LineNumber};
  3439         $err->{char} = $location->{ColumnNumber};
  3440         $err->{num}  = "no-xmlns";
  3441         $err->{type} = "E";
  3442         $err->{msg} =
  3443             "Missing xmlns attribute for element $element->{Name}. The " .
  3444             "value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}";
  3445 
  3446         # ...
  3447         $self->{_file}->{'Is Valid'} = FALSE;
  3448         push @{$self->{_file}->{Errors}}, $err;
  3449     }
  3450     elsif ($has_xmlns and
  3451         (defined $self->{CFG}->{Types}->{$doctype}->{Namespace}) and
  3452         ($xmlns_value ne $self->{CFG}->{Types}->{$doctype}->{Namespace}))
  3453     {
  3454 
  3455         # whine if root xmlns element is not the one specificed by the spec
  3456         my $err      = {};
  3457         my $location = $self->{_parser}->get_location();
  3458         &W3C::Validator::MarkupValidator::set_error_uri($err,
  3459             $location->{FileName});
  3460 
  3461         # S::P::O does not provide src context, set to empty for non-doc errors.
  3462         $err->{line} = $location->{LineNumber};
  3463         $err->{char} = $location->{ColumnNumber};
  3464         $err->{num}  = "wrong-xmlns";
  3465         $err->{type} = "E";
  3466         $err->{msg} =
  3467             "Wrong xmlns attribute for element $element->{Name}. The " .
  3468             "value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}";
  3469 
  3470         # ...
  3471         $self->{_file}->{'Is Valid'} = FALSE;
  3472         push @{$self->{_file}->{Errors}}, $err;
  3473     }
  3474 }
  3475 
  3476 sub error
  3477 {
  3478     my $self  = shift;
  3479     my $error = shift;
  3480     my $mess;
  3481     eval { $mess = $self->{_parser}->split_message($error); };
  3482     if ($@) {
  3483 
  3484         # this is a message that S:P:O could not handle, we skip its croaking
  3485         return;
  3486     }
  3487     my $File = $self->{_file};
  3488 
  3489     my $err = {};
  3490     &W3C::Validator::MarkupValidator::set_error_uri($err,
  3491         $self->{_parser}->get_location()->{FileName});
  3492 
  3493     # S::P::O does not provide src context, set to empty for non-doc errors.
  3494     $err->{src}  = "" if $err->{uri};
  3495     $err->{line} = $mess->{primary_message}{LineNumber};
  3496     $err->{char} = $mess->{primary_message}{ColumnNumber} + 1;
  3497     $err->{num}  = $mess->{primary_message}{Number};
  3498     $err->{type} = $mess->{primary_message}{Severity};
  3499     $err->{msg}  = $mess->{primary_message}{Text};
  3500 
  3501     # our parser OpenSP is not quite XML-aware, or XML Namespaces Aware,
  3502     # so we filter out a few errors for now
  3503 
  3504     my $is_xml = &W3C::Validator::MarkupValidator::is_xml($File);
  3505 
  3506     if ($is_xml and $err->{num} eq '108' and $err->{msg} =~ m{ "xmlns:\S+"}) {
  3507 
  3508         # the error is about a missing xmlns: attribute definition"
  3509         return;    # this is not an error, 'cause we said so
  3510     }
  3511 
  3512     if ($err->{num} eq '187')
  3513 
  3514         # filtering out no "document type declaration; will parse without
  3515         # validation" if root element is not html and mode is xml...
  3516     {
  3517 
  3518         # since parsing was done without validation, result can only be
  3519         # "well-formed"
  3520         if ($is_xml and lc($File->{Root}) ne 'html') {
  3521             $File->{XMLWF_ONLY} = TRUE;
  3522             W3C::Validator::MarkupValidator::add_warning('W09xml', {});
  3523             return;    # don't report this as an error, just proceed
  3524         }
  3525 
  3526         # if mode is not XML, we do report the error. It should not happen in
  3527         # the case of <html> without doctype, in that case the error message
  3528         # will be #344
  3529     }
  3530 
  3531     if (($err->{num} eq '113') and index($err->{msg}, 'xml:space') != -1) {
  3532 
  3533         # FIXME
  3534         # this is a problem with some of the "flattened" W3C DTDs, filtering
  3535         # them out to not confuse users. hoping to get the DTDs fixed, see
  3536         # http://lists.w3.org/Archives/Public/www-html-editor/2007AprJun/0010.html
  3537         return;    # don't report this, just proceed
  3538     }
  3539 
  3540     if ($is_xml and $err->{num} eq '344' and $File->{Namespace}) {
  3541 
  3542         # we are in XML mode, we have a namespace, but no doctype.
  3543         # the validator will already have said "no doctype, falling back to
  3544         # default" above
  3545         # no need to report this.
  3546         return;    # don't report this, just proceed
  3547     }
  3548 
  3549     if (($err->{num} eq '248') or
  3550         ($err->{num} eq '247') or
  3551         ($err->{num} eq '246'))
  3552     {
  3553 
  3554         # these two errors should be triggered by -wmin-tag to report shorttag
  3555         # used, but we're making them warnings, not errors
  3556         # see http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.7
  3557         $err->{type} = "W";
  3558     }
  3559 
  3560     # Workaround for onsgmls as of 1.5 sometimes allegedly reporting errors
  3561     # beyond EOL.  If you see this warning in your web server logs, please
  3562     # let the validator developers know, see http://validator.w3.org/feedback.html
  3563     # As long as $err may be from somewhere else than the document (such as
  3564     # from a DTD) and we have no way of identifying these cases, this
  3565     # produces bogus results and error log spewage, so commented out for now.
  3566     #  if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) {
  3567     #    warn("Warning: reported error column larger than line length " .
  3568     #         "($err->{char} > $l) in $File->{URI} line $err->{line}, " .
  3569     #         "OpenSP bug? Resetting to line length.");
  3570     #    $err->{char} = $l;
  3571     #  }
  3572 
  3573     # No or unknown FPI and a relative SI.
  3574     if ($err->{msg} =~ m(cannot (?:open|find))) {
  3575         $File->{'Error Flagged'} = TRUE;
  3576         my $tmpl = &W3C::Validator::MarkupValidator::get_error_template($File);
  3577         $tmpl->param(fatal_parse_extid_error => TRUE);
  3578         $tmpl->param(fatal_parse_extid_msg   => $err->{msg});
  3579     }
  3580 
  3581     # No DOCTYPE found! We are falling back to vanilla DTD
  3582     if (index($err->{msg}, "prolog can't be omitted") != -1) {
  3583         if (lc($File->{Root}) eq 'html') {
  3584             my $dtd = $File->{"Default DOCTYPE"}->{$is_xml ? "XHTML" : "HTML"};
  3585             W3C::Validator::MarkupValidator::add_warning('W09',
  3586                 {W09_dtd => $dtd});
  3587         }
  3588         else {    # not html root element, we are not using fallback
  3589             unless ($is_xml) {
  3590                 $File->{'Is Valid'} = FALSE;
  3591                 W3C::Validator::MarkupValidator::add_warning('W09nohtml', {});
  3592             }
  3593         }
  3594 
  3595         return;    # Don't report this as a normal error.
  3596     }
  3597 
  3598     # TODO: calling exit() here is probably a bad idea
  3599     W3C::Validator::MarkupValidator::abort_if_error_flagged($File);
  3600 
  3601     push @{$File->{Errors}}, $err;
  3602 
  3603     # ...
  3604     $File->{'Is Valid'} = FALSE if $err->{type} eq 'E';
  3605 
  3606     if (defined $mess->{aux_message}) {
  3607 
  3608         # "duplicate id ... first defined here" style messages
  3609         push @{$File->{Errors}},
  3610             {
  3611             line => $mess->{aux_message}{LineNumber},
  3612             char => $mess->{aux_message}{ColumnNumber} + 1,
  3613             msg  => $mess->{aux_message}{Text},
  3614             type => 'I',
  3615             };
  3616     }
  3617 }
  3618 
  3619 package W3C::Validator::EventHandler::Outliner;
  3620 
  3621 #
  3622 # Define global constants
  3623 use constant TRUE  => 1;
  3624 use constant FALSE => 0;
  3625 
  3626 #
  3627 # Tentative Validation Severities.
  3628 use constant T_WARN  => 4;    # 0000 0100
  3629 use constant T_ERROR => 8;    # 0000 1000
  3630 
  3631 use base qw(W3C::Validator::EventHandler);
  3632 
  3633 sub new
  3634 {
  3635     my $class  = shift;
  3636     my $parser = shift;
  3637     my $File   = shift;
  3638     my $CFG    = shift;
  3639     my $self   = $class->SUPER::new($parser, $File, $CFG);
  3640     $self->{am_in_heading} = 0;
  3641     $self->{heading_text}  = [];
  3642     bless $self, $class;
  3643 }
  3644 
  3645 sub data
  3646 {
  3647     my ($self, $chars) = @_;
  3648     push(@{$self->{heading_text}}, $chars->{Data}) if $self->{am_in_heading};
  3649 }
  3650 
  3651 sub start_element
  3652 {
  3653     my ($self, $element) = @_;
  3654     if ($element->{Name} =~ /^h([1-6])$/i) {
  3655         $self->{_file}->{heading_outline} ||= "";
  3656         $self->{_file}->{heading_outline} .=
  3657             "    " x int($1) . "[$element->{Name}] ";
  3658         $self->{am_in_heading} = 1;
  3659     }
  3660 
  3661     return $self->SUPER::start_element($element);
  3662 }
  3663 
  3664 sub end_element
  3665 {
  3666     my ($self, $element) = @_;
  3667     if ($element->{Name} =~ /^h[1-6]$/i) {
  3668         my $text = join("", @{$self->{heading_text}});
  3669         $text =~ s/^\s+//g;
  3670         $text =~ s/\s+/ /g;
  3671         $text =~ s/\s+$//g;
  3672         $self->{_file}->{heading_outline} .= "$text\n";
  3673         $self->{am_in_heading} = 0;
  3674         $self->{heading_text}  = [];
  3675     }
  3676 }
  3677 
  3678 #####
  3679 
  3680 package W3C::Validator::UserAgent;
  3681 
  3682 use HTTP::Message qw();
  3683 use LWP::UserAgent 2.032 qw();    # Need 2.032 for default_header()
  3684 use Net::hostent qw(gethostbyname);
  3685 use Net::IP qw();
  3686 use Socket qw(inet_ntoa);
  3687 
  3688 use base qw(LWP::UserAgent);
  3689 
  3690 BEGIN {
  3691 
  3692     # The 4k default line length in LWP <= 5.832 isn't enough for example to
  3693     # accommodate 4kB cookies (RFC 2985); bump it (#6678).
  3694     require LWP::Protocol::http;
  3695     push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8 * 1024);
  3696 }
  3697 
  3698 sub new
  3699 {
  3700     my ($proto, $CFG, $File, @rest) = @_;
  3701     my $class = ref($proto) || $proto;
  3702     my $self = $class->SUPER::new(@rest);
  3703 
  3704     $self->{'W3C::Validator::CFG'}  = $CFG;
  3705     $self->{'W3C::Validator::File'} = $File;
  3706 
  3707     $self->env_proxy();
  3708     $self->agent($File->{Opt}->{'User Agent'});
  3709     $self->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']);
  3710 
  3711     # Don't parse the http-equiv stuff.
  3712     $self->parse_head(0);
  3713 
  3714     # Tell caches in the middle we want a fresh copy (Bug 4998).
  3715     $self->default_header('Cache-Control' => 'max-age=0');
  3716 
  3717     # If not in debug mode, set Accept-Encoding to what LWP (>= 5.816) can handle
  3718     $self->default_header(
  3719         'Accept-Encoding' => scalar HTTP::Message::decodable())
  3720         if (!$File->{Opt}->{Debug} && HTTP::Message->can('decodable'));
  3721 
  3722     # Our timeout should be set to something lower than the web server's,
  3723     # remembering to give some head room for the actual validation to take
  3724     # place after the document has been fetched (something like 15 seconds
  3725     # should be plenty).  validator.w3.org instances have their timeout set
  3726     # to 60 seconds as of writing this (#4985, #6950).
  3727     $self->timeout(45);
  3728 
  3729     return $self;
  3730 }
  3731 
  3732 sub redirect_ok
  3733 {
  3734     my ($self, $req, $res) = @_;
  3735     return $self->SUPER::redirect_ok($req, $res) && $self->uri_ok($req->uri());
  3736 }
  3737 
  3738 sub uri_ok
  3739 {
  3740     my ($self, $uri) = @_;
  3741 
  3742     return 1
  3743         if ($self->{'W3C::Validator::CFG'}->{'Allow Private IPs'} ||
  3744         !$uri->can('host'));
  3745 
  3746     my $h5uri = $self->{'W3C::Validator::CFG'}->{External}->{HTML5};
  3747     if ($h5uri) {
  3748         my $clone = $uri->clone();
  3749         $clone->query(undef);
  3750         $clone->fragment(undef);
  3751         $h5uri = URI->new($h5uri);
  3752         $h5uri->query(undef);
  3753         $h5uri->fragment(undef);
  3754         return 1 if $clone->eq($h5uri);
  3755     }
  3756 
  3757     my $addr = my $iptype = undef;
  3758     if (my $host = gethostbyname($uri->host())) {
  3759         $addr = inet_ntoa($host->addr()) if $host->addr();
  3760         if ($addr && (my $ip = Net::IP->new($addr))) {
  3761             $iptype = $ip->iptype();
  3762         }
  3763     }
  3764     if ($iptype && $iptype ne 'PUBLIC') {
  3765         my $File = $self->{'W3C::Validator::File'};
  3766         $File->{'Error Flagged'} = 1;
  3767         my $tmpl = &W3C::Validator::MarkupValidator::get_error_template($File);
  3768         $tmpl->param(fatal_ip_error    => 1);
  3769         $tmpl->param(fatal_ip_host     => $uri->host() || 'undefined');
  3770         $tmpl->param(fatal_ip_hostname => 1)
  3771             if ($addr and $uri->host() ne $addr);
  3772         return 0;
  3773     }
  3774     return 1;
  3775 }
  3776 
  3777 # Local Variables:
  3778 # mode: perl
  3779 # indent-tabs-mode: nil
  3780 # cperl-indent-level: 4
  3781 # cperl-continued-statement-offset: 4
  3782 # cperl-brace-offset: -4
  3783 # perl-indent-level: 4
  3784 # End:
  3785 # ex: ts=4 sw=4 et