CSS is a language for applying styles to HTML documents. It defines selectors to associate those styles with specific HTML elements.
XPath is a language for selecting nodes in XML documents, which can also be used with HTML.
XPath expressions are very powerful, and are the foundation of Web Crawler Selectors. In fact, CSS selectors are converted to XPath under the hood.
While perhaps not as popular as CSS selectors, XPath expressions offer more power because, besides navigating the structure, they can also look at the content.
Bing Picture e.g.
You can employ XPath expressions to scrape Bing pictures, e.g.
# Parse a small HTML document with lxml and query it with XPath expressions.
# The comment lines under each print() show the output that call is expected
# to produce.
from lxml import etree

html_doc = """
<!DOCTYPE html>
<!-- This is a html file. -->
<html lang="en"><head><meta charset="UTF-8" /><title>Alice in Wonderland </title></head>
<body>
<h1>Alice in Wonderland</h1>
<p class="title" id="dormouse"><b>The Dormouse's story!!!</b></p>
<pre>start <span>initialize the story</span><i> end</i></pre>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister 1" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...
"""

# The document above is deliberately truncated; the serialized tree printed
# below shows the closing </p></body></html> tags that the parser adds.
html = etree.HTML(html_doc)
# Alternative: parse from a file instead of from a string.
# html = etree.parse('Alice.html', etree.HTMLParser())
result = etree.tostring(html)  # bytes, hence the decode() below
print(result.decode('utf-8'))
# <html lang="en"><head><meta charset="UTF-8"/><title>Alice in Wonderland </title></head>
# <body>
# <h1>Alice in Wonderland</h1>
# <p class="title" id="dormouse"><b>The Dormouse's story!!!</b></p>
# <pre>start <span>initialize the story</span><i> end</i></pre>
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister 1" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
# <p class="story">...
# </p></body></html>

# --- Basic path expressions and predicates ---

# '//' selects matching nodes anywhere in the document.
print(html.xpath('//p'))
# [<Element p at 0x4af1d00>, <Element p at 0x4b23030>, <Element p at 0x4b230a8>]

# '//text()' collects the text nodes below the matched element.
print(html.xpath('//p[@class="title"]//text()'))
# ["The Dormouse's story!!!"]

# '..' and 'parent::*' are two spellings of the same step to the parent node.
print(html.xpath('//a[@id="link1"]/../@class'))
print(html.xpath('//a[@id="link1"]/parent::*/@class'))
# ['story']

# '@class="sister"' compares the whole attribute value, so "sister 1" etc.
# do not match; contains() does a substring test instead.
print(html.xpath('//a[@class="sister"]/text()'))
# []
print(html.xpath('//a[contains(@class, "sister")]/text()'))
# ['Elsie', 'Lacie', 'Tillie']

# --- XPath axes ---

print(html.xpath('//p[1]/ancestor::*'))
# [<Element html at 0x4b35cb0>, <Element body at 0x4b35210>]

print(html.xpath('//p[1]/attribute::*'))
# ['title', 'dormouse']

print(html.xpath('//p[2]/child::a[@id="link1"]/text()'))
# ['Elsie']

# The <a> elements are nested inside <p>, so they are descendants of <body>
# but not direct children of it.
print(html.xpath('//body/descendant::a[@id="link1"]/text()'))
# ['Elsie']
print(html.xpath('//body/child::a[@id="link1"]/text()'))
# []

# following::* walks every element after the context node in document order:
# <pre> is the first, the <span> inside it is the second.
print(html.xpath('//p[1]/following::*[2]/text()'))
# ['initialize the story']

print(html.xpath('//p[1]/following-sibling::*'))
# [<Element pre at 0x4da7df0>, <Element p at 0x4da7ee0>, <Element p at 0x4da7490>]
# Query and modify the same sample document with pyquery's jQuery-style API.
# The comment lines under each print() show the output that call is expected
# to produce.
from pyquery import PyQuery as pq

html_doc = """
<!DOCTYPE html>
<!-- This is a html file. -->
<html lang="en"><head><meta charset="UTF-8" /><title>Alice in Wonderland </title></head>
<body>
<h1>Alice in Wonderland</h1>
<p class="title" id="dormouse"><b>The Dormouse's story!!!</b></p>
<pre>start <span>initialize the story</span><i> end</i></pre>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister 1" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...
"""

doc = pq(html_doc)
# Alternative constructors shown by the tutorial (URL or local file):
# doc = pq(URL)
# doc=pq(filename='Alice.html')
print(doc)
# <html lang="en"><head><meta charset="UTF-8"/><title>Alice in Wonderland </title></head>
# <body>
# <h1>Alice in Wonderland</h1>
# <p class="title" id="dormouse"><b>The Dormouse's story!!!</b></p>
# <pre>start <span>initialize the story</span><i> end</i></pre>
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister 1" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
# <p class="story">...
# </p></body></html>

# --- Selecting elements ---

print(doc('p.story').find('a'))
# <a href="http://example.com/elsie" class="sister 1" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
# and they lived at the bottom of a well.

# Unlike the XPath test @class="sister", the CSS class selector matches
# elements whose class attribute is "sister 1", "sister 2", ...
print(doc('.sister'))
# <a href="http://example.com/elsie" class="sister 1" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
# and they lived at the bottom of a well.

# --- Reading attributes, HTML and text ---

# Two spellings of the same attribute lookup; on a multi-element selection
# only the first element's value is reported.
print(doc('p.story').find('a').attr('href'))
print(doc('p.story').find('a').attr.href)
# http://example.com/elsie

# html() reports the inner HTML of the first matched element only ...
print(doc('p.story').html())
# Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister 1" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
# and they lived at the bottom of a well.
print(doc('p.story').find('a').html())
# Elsie

# ... whereas text() joins the text of every matched element.
print(doc('p.story').find('a').text())
# Elsie Lacie Tillie

# items() iterates the selection one element at a time, so per-element
# values can be read.
for item in doc('p.story a').items():
    print(item.attr('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie

for item in doc('p.story a').items():
    print(item.text())
# Elsie
# Lacie
# Tillie

# --- Pseudo-classes ---

print(doc('p:last-child').text())
# ...

print(doc('a:nth-child(2)'))
print(doc('a:nth-child(2n)'))
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and

# NOTE(review): pyquery documents :gt(n) as "index over n"; confirm the
# output listed below against the installed pyquery version.
print(doc('a:gt(1)'))
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
# and they lived at the bottom of a well.

# --- Traversal: children, parent, siblings ---

print(doc('p.story').children())
# <a href="http://example.com/elsie" class="sister 1" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
# and they lived at the bottom of a well.

print(doc('p.story').children('#link2'))
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and

print(doc('p.title b').parent())
# <p class="title" id="dormouse"><b>The Dormouse's story!!!</b></p>

# The siblings of link1 are the other two <a> elements in the same <p>.
print(doc('a#link1').siblings())
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
# and they lived at the bottom of a well.

# Each item yielded by items() is itself a PyQuery object.
for ele in doc('.story').items():
    print(ele, type(ele))
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister 1" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister 2" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister 3" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
# <class 'pyquery.pyquery.PyQuery'>
# <p class="story">...
# </p> <class 'pyquery.pyquery.PyQuery'>

# --- Modifying the document ---
# The calls below mutate the tree held by `doc` in place, so they must stay
# after the read-only queries above.

story = doc('p.story').find('a')
story.attr('class', 'sister')       # overwrite the class on all three links
story.html("\n<b>I'm bold</b>")     # replace the content of each link
print(story)
# <a href="http://example.com/elsie" class="sister" id="link1">
# <b>I'm bold</b></a><a href="http://example.com/lacie" class="sister" id="link2">
# <b>I'm bold</b></a><a href="http://example.com/tillie" class="sister" id="link3">
# <b>I'm bold</b></a>

story.removeClass('sister')
print(story)
# <a href="http://example.com/elsie" class="" id="link1">
# <b>I'm bold</b></a><a href="http://example.com/lacie" class="" id="link2">
# <b>I'm bold</b></a><a href="http://example.com/tillie" class="" id="link3">
# <b>I'm bold</b></a>

story.addClass('sister')
story.text('We\'re makin\' a story\n')
print(story)
# <a href="http://example.com/elsie" class="sister" id="link1">We're makin' a story
# </a><a href="http://example.com/lacie" class="sister" id="link2">We're makin' a story
# </a><a href="http://example.com/tillie" class="sister" id="link3">We're makin' a story
# </a>

# Drop the links from the story paragraphs and print what text is left.
text = doc('p.story')
text.find('a').remove()
print(text.text())
# Once upon a time there were three little sisters; and their names were ...