Add support for checking links in CSS.
author ville
Sun, 07 Mar 2010 17:16:14 +0000
changeset 352 bb26d957084a
parent 351 dfdcf2d8782d
child 353 11ba4cf8338c
Add support for checking links in CSS.

Known issue: no support for line numbers yet.
Makefile.PL
bin/checklink
docs/checklink.html
     1.1 --- a/Makefile.PL	Sun Mar 07 16:34:35 2010 +0000
     1.2 +++ b/Makefile.PL	Sun Mar 07 17:16:14 2010 +0000
     1.3 @@ -10,15 +10,20 @@
     1.4      PREREQ_PM    => {
     1.5  
     1.6          # Hard dependencies:
     1.7 -        HTML::Entities => 0,
     1.8 -        HTML::Parser   => 3.20,
     1.9 -        HTTP::Request  => 0,
    1.10 -        HTTP::Response => 1.50,
    1.11 -        LWP::RobotUA   => 1.19,
    1.12 -        LWP::UserAgent => 0,
    1.13 -        Time::HiRes    => 0,
    1.14 -        URI            => 1.31,
    1.15 -        URI::Escape    => 0,
    1.16 +        CSS::DOM            => 0.09,
    1.17 +        CSS::DOM::Constants => 0,
    1.18 +        CSS::DOM::Style     => 0,
    1.19 +        CSS::DOM::Util      => 0,
    1.20 +        HTML::Entities      => 0,
    1.21 +        HTML::Parser        => 3.20,
    1.22 +        HTTP::Headers::Util => 0,
    1.23 +        HTTP::Request       => 0,
    1.24 +        HTTP::Response      => 1.50,
    1.25 +        LWP::RobotUA        => 1.19,
    1.26 +        LWP::UserAgent      => 0,
    1.27 +        Time::HiRes         => 0,
    1.28 +        URI                 => 1.31,
    1.29 +        URI::Escape         => 0,
    1.30  
    1.31          # Optional, but required if using a config file:
    1.32          Config::General => 2.06,
     2.1 --- a/bin/checklink	Sun Mar 07 16:34:35 2010 +0000
     2.2 +++ b/bin/checklink	Sun Mar 07 17:16:14 2010 +0000
     2.3 @@ -5,7 +5,7 @@
     2.4  # (c) 1999-2010 World Wide Web Consortium
     2.5  # based on Renaud Bruyeron's checklink.pl
     2.6  #
     2.7 -# $Id: checklink,v 4.187 2010-03-07 16:34:34 ville Exp $
     2.8 +# $Id: checklink,v 4.188 2010-03-07 17:16:13 ville Exp $
     2.9  #
    2.10  # This program is licensed under the W3C(r) Software License:
    2.11  #       http://www.w3.org/Consortium/Legal/copyright-software
    2.12 @@ -219,10 +219,15 @@
    2.13  package W3C::LinkChecker;
    2.14  
    2.15  use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
    2.16 -    $DocType $Head $Accept $ContentTypes %Cfg);
    2.17 -
    2.18 +    $DocType $Head $Accept $ContentTypes %Cfg $CssUrl);
    2.19 +
    2.20 +use CSS::DOM 0.09 qw();    # >= 0.09 for many bugfixes
    2.21 +use CSS::DOM::Constants qw(:rule);
    2.22 +use CSS::DOM::Style qw();
    2.23 +use CSS::DOM::Util qw();
    2.24  use HTML::Entities qw();
    2.25 -use HTML::Parser 3.20 qw();      # >= 3.20 for "line" argspec identifier
    2.26 +use HTML::Parser 3.20 qw();    # >= 3.20 for "line" argspec identifier
    2.27 +use HTTP::Headers::Util qw();
    2.28  use HTTP::Request qw();
    2.29  use HTTP::Response 1.50 qw();    # >= 1.50 for decoded_content()
    2.30  use Time::HiRes qw();
    2.31 @@ -301,7 +306,7 @@
    2.32      $PROGRAM  = 'W3C-checklink';
    2.33      $VERSION  = '4.5';
    2.34      $REVISION = sprintf('version %s (c) 1999-2010 W3C', $VERSION);
    2.35 -    my ($cvsver) = q$Revision: 4.187 $ =~ /(\d+[\d\.]*\.\d+)/;
    2.36 +    my ($cvsver) = q$Revision: 4.188 $ =~ /(\d+[\d\.]*\.\d+)/;
    2.37      $AGENT = sprintf(
    2.38          '%s/%s [%s] %s',
    2.39          $PROGRAM, $VERSION, $cvsver,
    2.40 @@ -322,9 +327,13 @@
    2.41          application/vnd.wap.xhtml+xml;q=0.6
    2.42      );
    2.43      $Accept = join(', ', @content_types, '*/*;q=0.5');
    2.44 +    push(@content_types, "text/css");
    2.45      my $re = join('|', map { s/;.*//; quotemeta } @content_types);
    2.46      $ContentTypes = qr{\b(?:$re)\b}io;
    2.47  
    2.48 +    # Regexp for matching URL values in CSS.
    2.49 +    $CssUrl = qr/(?:\s|^)url\(\s*(['"]?)(.*?)\1\s*\)(?=\s|$)/;
    2.50 +
    2.51      #
    2.52      # Read configuration.  If the W3C_CHECKLINK_CFG environment variable has
    2.53      # been set or the default contains a non-empty file, read it.  Otherwise,
    2.54 @@ -1145,13 +1154,16 @@
    2.55                  $result_anchor);
    2.56              my $esc_uri =
    2.57                  URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9.");
    2.58 +            print "<p>For reliable link checking results, check ";
    2.59 +
    2.60 +            if (!$response->{IsCss}) {
    2.61 +                printf("<a href=\"%s\">HTML validity</a> and ",
    2.62 +                    &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)));
    2.63 +            }
    2.64              printf(
    2.65 -                "<p>For reliable link checking results, check
    2.66 -<a href=\"%s\">HTML validity</a> first.  See also
    2.67 -<a href=\"%s\">CSS validity</a>.</p>
    2.68 +                "<a href=\"%s\">CSS validity</a> first.</p>
    2.69  <p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n",
    2.70 -                &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)),
    2.71 -                &encode(sprintf($Cfg{CSS_Validator_URI},    $esc_uri)),
    2.72 +                &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
    2.73                  &encode($Opts{_Self_URI})
    2.74              );
    2.75  
    2.76 @@ -1536,6 +1548,18 @@
    2.77      return 0;    # We always have at least one base location, but none matched.
    2.78  }
    2.79  
    2.80 +#################################
    2.81 +# Check for content type match. #
    2.82 +#################################
    2.83 +
    2.84 +sub is_content_type ($$)    # args: header value to test, media type to match
    2.85 +{
    2.86 +    my ($candidate, $type) = @_;
    2.87 +    return 0 unless ($candidate && $type);    # undef/empty argument: no match
    2.88 +    my @v = HTTP::Headers::Util::split_header_words($candidate);
    2.89 +    return scalar(@v) ? $type eq lc($v[0]->[0]) : 0;    # first token of first entry, lowercased; $type is compared as given
    2.90 +}
    2.91 +
    2.92  ##################################################
    2.93  # Check whether a URI has already been processed #
    2.94  ##################################################
    2.95 @@ -1666,6 +1690,8 @@
    2.96      &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp()))
    2.97          if $verbose_progress;
    2.98  
    2.99 +    $response->{IsCss} =
   2.100 +        is_content_type($response->content_type(), "text/css");
   2.101      $response->{Realm} = $realm if defined($realm);
   2.102  
   2.103      return $response;
   2.104 @@ -1767,9 +1793,13 @@
   2.105          return $p;
   2.106      }
   2.107  
   2.108 -    my $start;
   2.109      $p = W3C::LinkChecker->new();
   2.110      $p->{base} = $base_uri;
   2.111 +
   2.112 +    my $stype = $response->header("Content-Style-Type");
   2.113 +    $p->{style_is_css} = !$stype || is_content_type($stype, "text/css");
   2.114 +
   2.115 +    my $start;
   2.116      if (!$Opts{Summary_Only}) {
   2.117          $start = &get_timestamp();
   2.118          print("Parsing...\n");
   2.119 @@ -1778,26 +1808,40 @@
   2.120      # Content-Encoding etc already decoded in get_document().
   2.121      my $docref = $response->content_ref();
   2.122  
   2.123 -    # Count lines beforehand if needed for progress indicator.  In all cases,
   2.124 -    # the actual final number of lines processed shown is populated by our
   2.125 +    # Count lines beforehand if needed (for progress indicator, or CSS while
   2.126 +    # we don't get any line context out of the parser).  In case of HTML, the
   2.127 +    # actual final number of lines processed shown is populated by our
   2.128      # end_document handler.
   2.129 -    $p->{Total} = ($$docref =~ tr/\n//) if $Opts{Progress};
   2.130 +    $p->{Total} = ($$docref =~ tr/\n//)
   2.131 +        if ($response->{IsCss} || $Opts{Progress});
   2.132  
   2.133      # We only look for anchors if we are not interested in the links
   2.134      # obviously, or if we are running a recursive checking because we
   2.135      # might need this information later
   2.136      $p->{only_anchors} = !($links || $rec_needs_links);
   2.137  
   2.138 -    # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
   2.139 -    # Processing instructions are not parsed by process, but in this case
   2.140 -    # it should be. It's expensive, it's horrible, but it's the easiest way
   2.141 -    # for right now.
   2.142 -    $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
   2.143 -        unless $p->{only_anchors};
   2.144 -
   2.145 -    $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
   2.146 -
   2.147 -    $p->parse($$docref)->eof();
   2.148 +    if ($response->{IsCss}) {
   2.149 +
   2.150 +        # Parse as CSS
   2.151 +
   2.152 +        $p->parse_css($$docref, LINE_UNKNOWN());
   2.153 +    }
   2.154 +    else {
   2.155 +
   2.156 +        # Parse as HTML
   2.157 +
   2.158 +        # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
   2.159 +        # Processing instructions are not parsed by process, but in this case
   2.160 +        # it should be. It's expensive, it's horrible, but it's the easiest way
   2.161 +        # for right now.
   2.162 +        $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
   2.163 +            unless $p->{only_anchors};
   2.164 +
   2.165 +        $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
   2.166 +
   2.167 +        $p->parse($$docref)->eof();
   2.168 +    }
   2.169 +
   2.170      $response->content("");
   2.171  
   2.172      if (!$Opts{Summary_Only}) {
   2.173 @@ -1827,6 +1871,8 @@
   2.174      # Set up handlers
   2.175  
   2.176      $p->handler(start => 'start', 'self, tagname, attr, line');
   2.177 +    $p->handler(end   => 'end',   'self, tagname, line');
   2.178 +    $p->handler(text  => 'text',  'self, dtext, line');
   2.179      $p->handler(
   2.180          declaration => sub {
   2.181              my $self = shift;
   2.182 @@ -2006,9 +2052,66 @@
   2.183                  }
   2.184              }
   2.185          }
   2.186 +
   2.187 +        # Inline CSS:
   2.188 +        delete $self->{csstext};
   2.189 +        if ($tag eq 'style') {
   2.190 +            $self->{csstext} = ''
   2.191 +                if ((!$attr->{type} && $self->{style_is_css}) ||
   2.192 +                is_content_type($attr->{type}, "text/css"));
   2.193 +        }
   2.194 +        elsif ($self->{style_is_css} && (my $style = $attr->{style})) {
   2.195 +            $style = CSS::DOM::Style::parse($style);
   2.196 +            $self->parse_style($style, $line);
   2.197 +        }
   2.198      }
   2.199  
   2.200      $self->parse_progress($line) if $Opts{Progress};
   2.201 +    return;
   2.202 +}
   2.203 +
    2.204 +sub end    # parser "end" handler, registered with argspec 'self, tagname, line'
    2.205 +{
    2.206 +    my ($self, $tagname, $line) = @_;
    2.207 +
    2.208 +    $self->parse_css($self->{csstext}, $line) if ($tagname eq 'style');    # check buffered <style> text; parse_css returns early if there is none
    2.209 +    delete $self->{csstext};    # stop buffering text, whichever tag ended
    2.210 +
    2.211 +    $self->parse_progress($line) if $Opts{Progress};
    2.212 +    return;
    2.213 +}
   2.214 +
    2.215 +sub parse_css    # args: self, CSS source text, line number passed on with each link found
    2.216 +{
    2.217 +    my ($self, $css, $line) = @_;
    2.218 +    return unless $css;    # nothing to do for undef or empty text
    2.219 +
    2.220 +    my $sheet = CSS::DOM::parse($css);
    2.221 +    for my $rule (@{$sheet->cssRules()}) {    # walks the sheet's own rule list only
    2.222 +        if ($rule->type() == IMPORT_RULE()) {    # import rule: its href is recorded as a link
    2.223 +            $self->add_link($rule->href(), $self->{base}, $line);
    2.224 +        }
    2.225 +        elsif ($rule->type == STYLE_RULE()) {    # style rule: its declarations are scanned by parse_style
    2.226 +            $self->parse_style($rule->style(), $line);
    2.227 +        }    # rules of any other type are not inspected
    2.228 +    }
    2.229 +    return;
    2.230 +}
   2.231 +
    2.232 +sub parse_style    # args: self, style declaration object, line number passed on with each link found
    2.233 +{
    2.234 +    my ($self, $style, $line) = @_;
    2.235 +    return unless $style;    # no declaration object: nothing to scan
    2.236 +
    2.237 +    for (my $i = 0, my $len = $style->length(); $i < $len; $i++) {    # visit every property by index
    2.238 +        my $prop = $style->item($i);
    2.239 +        my $val  = $style->getPropertyValue($prop);
    2.240 +
    2.241 +        while ($val =~ /$CssUrl/go) {    # /g: one iteration per url(...) match in the value
    2.242 +            my $url = CSS::DOM::Util::unescape($2);    # $2 is the text inside url(), between the optional quotes
    2.243 +            $self->add_link($url, $self->{base}, $line);
    2.244 +        }
    2.245 +    }
    2.246  
    2.247      return;
    2.248  }
   2.249 @@ -2044,10 +2147,19 @@
   2.250      return;
   2.251  }
   2.252  
    2.253 +sub text    # parser "text" handler, registered with argspec 'self, dtext, line'
    2.254 +{
    2.255 +    my ($self, $text, $line) = @_;
    2.256 +    $self->{csstext} .= $text if defined($self->{csstext});    # buffer only while csstext is defined (start sets it to '' for a CSS <style>)
    2.257 +    $self->parse_progress($line) if $Opts{Progress};
    2.258 +    return;
    2.259 +}
   2.260 +
    2.261  sub end_document
    2.262  {
    2.263      my ($self, $line) = @_;
    2.264      $self->{Total} = $line;
    2.265 +    delete $self->{csstext};    # discard any CSS text still buffered when the document ends
    2.266      return;
    2.267  }
   2.268  
   2.269 @@ -2986,10 +3098,17 @@
   2.270          if ($doc_count > 0 && !$Opts{Quiet});
   2.271      if (!$doc_count) {
   2.272          print <<'EOF';
   2.273 -  <div class="intro">
   2.274 -      <p>This Link Checker looks for issues in links, anchors and referenced objects in a Web page, or recursively on a whole Web site.
   2.275 -      For best results, it is recommended to first ensure that the documents checked use <a href="http://validator.w3.org/">Valid (X)HTML Markup</a>. The Link Checker is part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and Quality Web tools</a>.</p>
   2.276 -  </div>
   2.277 +<div class="intro">
   2.278 +  <p>
   2.279 +    This Link Checker looks for issues in links, anchors and referenced objects
   2.280 +    in a Web page, CSS style sheet, or recursively on a whole Web site. For
   2.281 +    best results, it is recommended to first ensure that the documents checked
   2.282 +    use Valid <a href="http://validator.w3.org/">(X)HTML Markup</a> and
   2.283 +    <a href="http://jigsaw.w3.org/css-validator/">CSS</a>. The Link Checker is
   2.284 +    part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and
   2.285 +    Quality Web tools</a>.
   2.286 +  </p>
   2.287 +</div>
   2.288  EOF
   2.289      }
   2.290      printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
     3.1 --- a/docs/checklink.html	Sun Mar 07 16:34:35 2010 +0000
     3.2 +++ b/docs/checklink.html	Sun Mar 07 17:16:14 2010 +0000
     3.3 @@ -6,7 +6,7 @@
     3.4      <title>W3C Link Checker Documentation</title>
     3.5      <link rev="made" href="mailto:www-validator@w3.org" />
     3.6      <style type="text/css" media="all">@import "linkchecker.css";</style>
     3.7 -    <meta name="revision" content="$Id: checklink.html,v 1.59 2010-03-07 16:34:35 ville Exp $" />
     3.8 +    <meta name="revision" content="$Id: checklink.html,v 1.60 2010-03-07 17:16:14 ville Exp $" />
     3.9    </head>
    3.10  
    3.11    <body>
    3.12 @@ -54,8 +54,8 @@
    3.13      <h2><a name="what" id="what">What it does</a></h2>
    3.14  
    3.15      <p>
    3.16 -      The link checker reads an HTML or XHTML document and extracts a list
    3.17 -      of anchors and links.
    3.18 +      The link checker reads an HTML or XHTML document or a CSS style sheet
    3.19 +      and extracts a list of anchors and links.
    3.20      </p>
    3.21  
    3.22      <p>
    3.23 @@ -149,6 +149,7 @@
    3.24            <li><a href="http://search.cpan.org/dist/W3C-LinkChecker/">W3C-LinkChecker</a> (the link checker itself)</li>
    3.25  	  <li><a href="http://search.cpan.org/dist/CGI.pm/">CGI.pm</a> (required for CGI mode only)</li>
    3.26            <li><a href="http://search.cpan.org/dist/Config-General/">Config-General</a> (optional, version 2.06 or newer; required only for reading the (optional) configuration file)</li>
    3.27 +          <li><a href="http://search.cpan.org/dist/CSS-DOM/">CSS-DOM</a> (version 0.09 or newer)</li>
    3.28  	  <li><a href="http://search.cpan.org/dist/HTML-Parser/">HTML-Parser</a> (version 3.20 or newer)</li>
    3.29  	  <li><a href="http://search.cpan.org/dist/libwww-perl/">libwww-perl</a> (version 5.802 or newer)</li>
    3.30            <li><a href="http://search.cpan.org/dist/Net-IP/">Net-IP</a> (optional but recommended; required for restricting access to <a href="http://www.ietf.org/rfc/rfc1918.txt">private IP addresses</a>)</li>
    3.31 @@ -325,7 +326,7 @@
    3.32      <address>
    3.33        <a title="Send Feedback for the W3C Link Checker"
    3.34          href="http://validator.w3.org/feedback.html">The W3C Validator Team</a><br />
    3.35 -      $Date: 2010-03-07 16:34:35 $
    3.36 +      $Date: 2010-03-07 17:16:14 $
    3.37      </address>
    3.38      <p class="copyright">
    3.39        <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> &copy; 1994-2010