package Mojo::DOM; use Mojo::Base -strict; use overload '@{}' => sub { shift->child_nodes }, '%{}' => sub { shift->attr }, bool => sub {1}, '""' => sub { shift->to_string }, fallback => 1; # "Fry: This snow is beautiful. I'm glad global warming never happened. # Leela: Actually, it did. But thank God nuclear winter canceled it out." use Mojo::Collection; use Mojo::DOM::CSS; use Mojo::DOM::HTML; use Scalar::Util qw(blessed weaken); use Storable qw(dclone); sub all_text { _text(_nodes($_[0]->tree), $_[0]->xml, 1) } sub ancestors { _select($_[0]->_collect([_ancestors($_[0]->tree)]), $_[1]) } sub append { shift->_add(1, @_) } sub append_content { shift->_content(1, 0, @_) } sub at { my $self = shift; return undef unless my $result = $self->_css->select_one(@_); return $self->_build($result, $self->xml); } sub attr { my $self = shift; # Hash my $tree = $self->tree; my $attrs = $tree->[0] ne 'tag' ? {} : $tree->[2]; return $attrs unless @_; # Get return $attrs->{$_[0]} unless @_ > 1 || ref $_[0]; # Set my $values = ref $_[0] ? $_[0] : {@_}; @$attrs{keys %$values} = values %$values; return $self; } sub child_nodes { $_[0]->_collect(_nodes($_[0]->tree)) } sub children { _select($_[0]->_collect(_nodes($_[0]->tree, 1)), $_[1]) } sub content { my $self = shift; my $type = $self->type; if ($type eq 'root' || $type eq 'tag') { return $self->_content(0, 1, @_) if @_; my $html = Mojo::DOM::HTML->new(xml => $self->xml); return join '', map { $html->tree($_)->render } @{_nodes($self->tree)}; } return $self->tree->[1] unless @_; $self->tree->[1] = shift; return $self; } sub descendant_nodes { $_[0]->_collect(_all(_nodes($_[0]->tree))) } sub find { my $self = shift; return $self->_collect($self->_css->select(@_)); } sub following { _select($_[0]->_collect(_siblings($_[0]->tree, 1, 1)), $_[1]) } sub following_nodes { $_[0]->_collect(_siblings($_[0]->tree, 0, 1)) } sub matches { shift->_css->matches(@_) } sub namespace { my $self = shift; return undef if (my $tree = $self->tree)->[0] ne 'tag'; # Extract namespace prefix and search parents my $ns = $tree->[1] =~ /^(.*?):/ ? "xmlns:$1" : undef; for my $node ($tree, _ancestors($tree)) { # Namespace for prefix my $attrs = $node->[2]; if ($ns) { $_ eq $ns and return $attrs->{$_} for keys %$attrs } # Namespace attribute elsif (defined $attrs->{xmlns}) { return $attrs->{xmlns} } } return undef; } sub new { my $class = shift; my $self = bless \Mojo::DOM::HTML->new, ref $class || $class; return @_ ? $self->parse(@_) : $self; } sub new_tag { my $self = shift; my $new = $self->new; $$new->tag(@_); $$new->xml($$self->xml) if ref $self; return $new; } sub next { $_[0]->_maybe(_siblings($_[0]->tree, 1, 1, 0)) } sub next_node { $_[0]->_maybe(_siblings($_[0]->tree, 0, 1, 0)) } sub parent { my $self = shift; return undef if (my $tree = $self->tree)->[0] eq 'root'; return $self->_build(_parent($tree), $self->xml); } sub parse { ${$_[0]}->parse($_[1]) and return $_[0] } sub preceding { _select($_[0]->_collect(_siblings($_[0]->tree, 1, 0)), $_[1]) } sub preceding_nodes { $_[0]->_collect(_siblings($_[0]->tree, 0)) } sub prepend { shift->_add(0, @_) } sub prepend_content { shift->_content(0, 0, @_) } sub previous { $_[0]->_maybe(_siblings($_[0]->tree, 1, 0, -1)) } sub previous_node { $_[0]->_maybe(_siblings($_[0]->tree, 0, 0, -1)) } sub remove { shift->replace('') } sub replace { my ($self, $new) = @_; return $self->parse($new) if (my $tree = $self->tree)->[0] eq 'root'; return $self->_replace(_parent($tree), $tree, _nodes($self->_parse($new))); } sub root { my $self = shift; return $self unless my $tree = _ancestors($self->tree, 1); return $self->_build($tree, $self->xml); } sub selector { return undef unless (my $tree = shift->tree)->[0] eq 'tag'; return join ' > ', reverse map { $_->[1] . ':nth-child(' . (@{_siblings($_, 1)} + 1) . ')' } $tree, _ancestors($tree); } sub strip { my $self = shift; return $self if (my $tree = $self->tree)->[0] ne 'tag'; return $self->_replace($tree->[3], $tree, _nodes($tree)); } sub tag { my ($self, $tag) = @_; return undef if (my $tree = $self->tree)->[0] ne 'tag'; return $tree->[1] unless $tag; $tree->[1] = $tag; return $self; } sub tap { shift->Mojo::Base::tap(@_) } sub text { _text(_nodes(shift->tree), 0, 0) } sub to_string { ${shift()}->render } sub tree { @_ > 1 ? (${$_[0]}->tree($_[1]) and return $_[0]) : ${$_[0]}->tree } sub type { shift->tree->[0] } sub val { my $self = shift; # "option" return $self->{value} // $self->text if (my $tag = $self->tag) eq 'option'; # "input" ("type=checkbox" and "type=radio") my $type = $self->{type} // ''; return $self->{value} // 'on' if $tag eq 'input' && ($type eq 'radio' || $type eq 'checkbox'); # "textarea", "input" or "button" return $tag eq 'textarea' ? $self->text : $self->{value} if $tag ne 'select'; # "select" my $v = $self->find('option:checked:not([disabled])')->grep(sub { !$_->ancestors('optgroup[disabled]')->size }) ->map('val'); return exists $self->{multiple} ? $v->size ? $v->to_array : undef : $v->last; } sub with_roles { shift->Mojo::Base::with_roles(@_) } sub wrap { shift->_wrap(0, @_) } sub wrap_content { shift->_wrap(1, @_) } sub xml { @_ > 1 ? (${$_[0]}->xml($_[1]) and return $_[0]) : ${$_[0]}->xml } sub _add { my ($self, $offset, $new) = @_; return $self if (my $tree = $self->tree)->[0] eq 'root'; my $parent = _parent($tree); splice @$parent, _offset($parent, $tree) + $offset, 0, @{_link($parent, _nodes($self->_parse($new)))}; return $self; } sub _all { my $nodes = shift; @$nodes = map { $_->[0] eq 'tag' ? ($_, @{_all(_nodes($_))}) : ($_) } @$nodes; return $nodes; } sub _ancestors { my ($tree, $root) = @_; return () unless $tree = _parent($tree); my @ancestors; do { push @ancestors, $tree } while ($tree->[0] eq 'tag') && ($tree = $tree->[3]); return $root ? $ancestors[-1] : @ancestors[0 .. $#ancestors - 1]; } sub _build { shift->new->tree(shift)->xml(shift) } sub _collect { my ($self, $nodes) = (shift, shift // []); my $xml = $self->xml; return Mojo::Collection->new(map { $self->_build($_, $xml) } @$nodes); } sub _content { my ($self, $start, $offset, $new) = @_; my $tree = $self->tree; unless ($tree->[0] eq 'root' || $tree->[0] eq 'tag') { my $old = $self->content; return $self->content($start ? $old . $new : $new . $old); } $start = $start ? ($#$tree + 1) : _start($tree); $offset = $offset ? $#$tree : 0; splice @$tree, $start, $offset, @{_link($tree, _nodes($self->_parse($new)))}; return $self; } sub _css { Mojo::DOM::CSS->new(tree => shift->tree) } sub _fragment { _link(my $r = ['root', @_], [@_]); $r } sub _link { my ($parent, $children) = @_; # Link parent to children for my $node (@$children) { my $offset = $node->[0] eq 'tag' ? 3 : 2; $node->[$offset] = $parent; weaken $node->[$offset]; } return $children; } sub _maybe { $_[1] ? $_[0]->_build($_[1], $_[0]->xml) : undef } sub _nodes { return () unless my $tree = shift; my @nodes = @$tree[_start($tree) .. $#$tree]; return shift() ? [grep { $_->[0] eq 'tag' } @nodes] : \@nodes; } sub _offset { my ($parent, $child) = @_; my $i = _start($parent); $_ eq $child ? last : $i++ for @$parent[$i .. $#$parent]; return $i; } sub _parent { $_[0]->[$_[0][0] eq 'tag' ? 3 : 2] } sub _parse { my ($self, $input) = @_; return Mojo::DOM::HTML->new(xml => $self->xml)->parse($input)->tree unless blessed $input && $input->isa('Mojo::DOM'); my $tree = dclone $input->tree; return $tree->[0] eq 'root' ? $tree : _fragment($tree); } sub _replace { my ($self, $parent, $child, $nodes) = @_; splice @$parent, _offset($parent, $child), 1, @{_link($parent, $nodes)}; return $self->parent; } sub _select { $_[1] ? $_[0]->grep(matches => $_[1]) : $_[0] } sub _siblings { my ($tree, $tags, $tail, $i) = @_; return defined $i ? undef : [] if $tree->[0] eq 'root'; my $nodes = _nodes(_parent($tree)); my $match = -1; defined($match++) and $_ eq $tree and last for @$nodes; if ($tail) { splice @$nodes, 0, $match + 1 } else { splice @$nodes, $match, ($#$nodes + 1) - $match } @$nodes = grep { $_->[0] eq 'tag' } @$nodes if $tags; return defined $i ? $i == -1 && !@$nodes ? undef : $nodes->[$i] : $nodes; } sub _start { $_[0][0] eq 'root' ? 1 : 4 } sub _text { my ($nodes, $xml, $all) = @_; my $text = ''; while (my $node = shift @$nodes) { my $type = $node->[0]; # Text if ($type eq 'text' || $type eq 'cdata' || $type eq 'raw') { $text .= $node->[1] } # Nested tag elsif ($type eq 'tag' && $all) { unshift @$nodes, @{_nodes($node)} if $xml || ($node->[1] ne 'script' && $node->[1] ne 'style'); } } return $text; } sub _wrap { my ($self, $content, $new) = @_; return $self if (my $tree = $self->tree)->[0] eq 'root' && !$content; return $self if $tree->[0] ne 'root' && $tree->[0] ne 'tag' && $content; # Find innermost tag my $current; my $first = $new = $self->_parse($new); $current = $first while $first = _nodes($first, 1)->[0]; return $self unless $current; # Wrap content if ($content) { push @$current, @{_link($current, _nodes($tree))}; splice @$tree, _start($tree), $#$tree, @{_link($tree, _nodes($new))}; return $self; } # Wrap element $self->_replace(_parent($tree), $tree, _nodes($new)); push @$current, @{_link($current, [$tree])}; return $self; } 1; =encoding utf8 =head1 NAME Mojo::DOM - Minimalistic HTML/XML DOM parser with CSS selectors =head1 SYNOPSIS use Mojo::DOM; # Parse my $dom = Mojo::DOM->new('

Test

123

'); # Find say $dom->at('#b')->text; say $dom->find('p')->map('text')->join("\n"); say $dom->find('[id]')->map(attr => 'id')->join("\n"); # Iterate $dom->find('p[id]')->reverse->each(sub { say $_->{id} }); # Loop for my $e ($dom->find('p[id]')->each) { say $e->{id}, ':', $e->text; } # Modify $dom->find('div p')->last->append('

456

'); $dom->at('#c')->prepend($dom->new_tag('p', id => 'd', '789')); $dom->find(':not(p)')->map('strip'); # Render say "$dom"; =head1 DESCRIPTION L is a minimalistic and relaxed HTML/XML DOM parser with CSS selector support. It will even try to interpret broken HTML and XML, so you should not use it for validation. =head1 NODES AND ELEMENTS When we parse an HTML/XML fragment, it gets turned into a tree of nodes. Hello World! There are currently eight different kinds of nodes, C, C, C, C, C, C, C and C. Elements are nodes of the type C. root |- doctype (html) +- tag (html) |- tag (head) | +- tag (title) | +- raw (Hello) +- tag (body) +- text (World!) While all node types are represented as L objects, some methods like L and L only apply to elements. =head1 HTML AND XML L defaults to HTML semantics, that means all tags and attribute names are lowercased and selectors need to be lowercase as well. # HTML semantics my $dom = Mojo::DOM->new('

Hi!

'); say $dom->at('p[id]')->text; If an XML declaration is found, the parser will automatically switch into XML mode and everything becomes case-sensitive. # XML semantics my $dom = Mojo::DOM->new('

Hi!

'); say $dom->at('P[ID]')->text; HTML or XML semantics can also be forced with the L method. # Force HTML semantics my $dom = Mojo::DOM->new->xml(0)->parse('

Hi!

'); say $dom->at('p[id]')->text; # Force XML semantics my $dom = Mojo::DOM->new->xml(1)->parse('

Hi!

'); say $dom->at('P[ID]')->text; =head1 METHODS L implements the following methods. =head2 all_text my $text = $dom->all_text; Extract text content from all descendant nodes of this element. For HTML documents C