foo
\tbar\n \n
baz ")
# Everything outside the
tag is reformatted, but everything
# inside is left alone.
self.assertEqual(
u'\n foo\n
\tbar\n \n
\n baz\n
',
soup.div.prettify())
def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("foo")
pretty = soup.prettify(formatter = lambda x: x.upper())
self.assertTrue("FOO" in pretty)
def test_prettify_outputs_unicode_by_default(self):
soup = self.soup("")
self.assertEqual(unicode, type(soup.prettify()))
def test_prettify_can_encode_data(self):
soup = self.soup("")
self.assertEqual(bytes, type(soup.prettify("utf-8")))
def test_html_entity_substitution_off_by_default(self):
markup = u"Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!"
soup = self.soup(markup)
encoded = soup.b.encode("utf-8")
self.assertEqual(encoded, markup.encode('utf-8'))
def test_encoding_substitution(self):
# Here's the tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('')
soup = self.soup(meta_tag)
# Parse the document, and the charset apprears unchanged.
self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')
# Encode the document into some encoding, and the encoding is
# substituted into the meta tag.
utf_8 = soup.encode("utf-8")
self.assertTrue(b"charset=utf-8" in utf_8)
euc_jp = soup.encode("euc_jp")
self.assertTrue(b"charset=euc_jp" in euc_jp)
shift_jis = soup.encode("shift-jis")
self.assertTrue(b"charset=shift-jis" in shift_jis)
utf_16_u = soup.encode("utf-16").decode("utf-16")
self.assertTrue("charset=utf-16" in utf_16_u)
def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
markup = ('foo
')
# Beautiful Soup used to try to rewrite the meta tag even if the
# meta tag got filtered out by the strainer. This test makes
# sure that doesn't happen.
strainer = SoupStrainer('pre')
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(soup.contents[0].name, 'pre')
class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings."""
def test_unicode_string_can_be_encoded(self):
html = u"\N{SNOWMAN}"
soup = self.soup(html)
self.assertEqual(soup.b.string.encode("utf-8"),
u"\N{SNOWMAN}".encode("utf-8"))
def test_tag_containing_unicode_string_can_be_encoded(self):
html = u"\N{SNOWMAN}"
soup = self.soup(html)
self.assertEqual(
soup.b.encode("utf-8"), html.encode("utf-8"))
def test_encoding_substitutes_unrecognized_characters_by_default(self):
html = u"\N{SNOWMAN}"
soup = self.soup(html)
self.assertEqual(soup.b.encode("ascii"), b"☃")
def test_encoding_can_be_made_strict(self):
html = u"\N{SNOWMAN}"
soup = self.soup(html)
self.assertRaises(
UnicodeEncodeError, soup.encode, "ascii", errors="strict")
def test_decode_contents(self):
html = u"\N{SNOWMAN}"
soup = self.soup(html)
self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents())
def test_encode_contents(self):
html = u"\N{SNOWMAN}"
soup = self.soup(html)
self.assertEqual(
u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
encoding="utf8"))
def test_deprecated_renderContents(self):
html = u"\N{SNOWMAN}"
soup = self.soup(html)
self.assertEqual(
u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
class TestNavigableStringSubclasses(SoupTest):
def test_cdata(self):
# None of the current builders turn CDATA sections into CData
# objects, but you can create them manually.
soup = self.soup("")
cdata = CData("foo")
soup.insert(1, cdata)
self.assertEqual(str(soup), "")
self.assertEqual(soup.find(text="foo"), "foo")
self.assertEqual(soup.contents[0], "foo")
def test_cdata_is_never_formatted(self):
"""Text inside a CData object is passed into the formatter.
But the return value is ignored.
"""
self.count = 0
def increment(*args):
self.count += 1
return "BITTER FAILURE"
soup = self.soup("")
cdata = CData("<><><>")
soup.insert(1, cdata)
self.assertEqual(
b"<><>]]>", soup.encode(formatter=increment))
self.assertEqual(1, self.count)
def test_doctype_ends_in_newline(self):
# Unlike other NavigableString subclasses, a DOCTYPE always ends
# in a newline.
doctype = Doctype("foo")
soup = self.soup("")
soup.insert(1, doctype)
self.assertEqual(soup.encode(), b"\n")
class TestSoupSelector(TreeTest):
HTML = """
The title
English
English UK
English US
French
"""
def setUp(self):
self.soup = BeautifulSoup(self.HTML)
def assertSelects(self, selector, expected_ids):
el_ids = [el['id'] for el in self.soup.select(selector)]
el_ids.sort()
expected_ids.sort()
self.assertEqual(expected_ids, el_ids,
"Selector %s, expected [%s], got [%s]" % (
selector, ', '.join(expected_ids), ', '.join(el_ids)
)
)
assertSelect = assertSelects
def assertSelectMultiple(self, *tests):
for selector, expected_ids in tests:
self.assertSelect(selector, expected_ids)
def test_one_tag_one(self):
els = self.soup.select('title')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].name, 'title')
self.assertEqual(els[0].contents, [u'The title'])
def test_one_tag_many(self):
els = self.soup.select('div')
self.assertEqual(len(els), 3)
for div in els:
self.assertEqual(div.name, 'div')
def test_tag_in_tag_one(self):
els = self.soup.select('div div')
self.assertSelects('div div', ['inner'])
def test_tag_in_tag_many(self):
for selector in ('html div', 'html body div', 'body div'):
self.assertSelects(selector, ['main', 'inner', 'footer'])
def test_tag_no_match(self):
self.assertEqual(len(self.soup.select('del')), 0)
def test_invalid_tag(self):
self.assertRaises(ValueError, self.soup.select, 'tag%t')
def test_header_tags(self):
self.assertSelectMultiple(
('h1', ['header1']),
('h2', ['header2', 'header3']),
)
def test_class_one(self):
for selector in ('.onep', 'p.onep', 'html p.onep'):
els = self.soup.select(selector)
self.assertEqual(len(els), 1)
self.assertEqual(els[0].name, 'p')
self.assertEqual(els[0]['class'], ['onep'])
def test_class_mismatched_tag(self):
els = self.soup.select('div.onep')
self.assertEqual(len(els), 0)
def test_one_id(self):
for selector in ('div#inner', '#inner', 'div div#inner'):
self.assertSelects(selector, ['inner'])
def test_bad_id(self):
els = self.soup.select('#doesnotexist')
self.assertEqual(len(els), 0)
def test_items_in_id(self):
els = self.soup.select('div#inner p')
self.assertEqual(len(els), 3)
for el in els:
self.assertEqual(el.name, 'p')
self.assertEqual(els[1]['class'], ['onep'])
self.assertFalse(els[0].has_attr('class'))
def test_a_bunch_of_emptys(self):
for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
self.assertEqual(len(self.soup.select(selector)), 0)
def test_multi_class_support(self):
for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
'.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
self.assertSelects(selector, ['pmulti'])
def test_multi_class_selection(self):
for selector in ('.class1.class3', '.class3.class2',
'.class1.class2.class3'):
self.assertSelects(selector, ['pmulti'])
def test_child_selector(self):
self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
self.assertSelects('.s1 > a span', ['s1a2s1'])
def test_child_selector_id(self):
self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
def test_attribute_equals(self):
self.assertSelectMultiple(
('p[class="onep"]', ['p1']),
('p[id="p1"]', ['p1']),
('[class="onep"]', ['p1']),
('[id="p1"]', ['p1']),
('link[rel="stylesheet"]', ['l1']),
('link[type="text/css"]', ['l1']),
('link[href="blah.css"]', ['l1']),
('link[href="no-blah.css"]', []),
('[rel="stylesheet"]', ['l1']),
('[type="text/css"]', ['l1']),
('[href="blah.css"]', ['l1']),
('[href="no-blah.css"]', []),
('p[href="no-blah.css"]', []),
('[href="no-blah.css"]', []),
)
def test_attribute_tilde(self):
self.assertSelectMultiple(
('p[class~="class1"]', ['pmulti']),
('p[class~="class2"]', ['pmulti']),
('p[class~="class3"]', ['pmulti']),
('[class~="class1"]', ['pmulti']),
('[class~="class2"]', ['pmulti']),
('[class~="class3"]', ['pmulti']),
('a[rel~="friend"]', ['bob']),
('a[rel~="met"]', ['bob']),
('[rel~="friend"]', ['bob']),
('[rel~="met"]', ['bob']),
)
def test_attribute_startswith(self):
self.assertSelectMultiple(
('[rel^="style"]', ['l1']),
('link[rel^="style"]', ['l1']),
('notlink[rel^="notstyle"]', []),
('[rel^="notstyle"]', []),
('link[rel^="notstyle"]', []),
('link[href^="bla"]', ['l1']),
('a[href^="http://"]', ['bob', 'me']),
('[href^="http://"]', ['bob', 'me']),
('[id^="p"]', ['pmulti', 'p1']),
('[id^="m"]', ['me', 'main']),
('div[id^="m"]', ['main']),
('a[id^="m"]', ['me']),
)
def test_attribute_endswith(self):
self.assertSelectMultiple(
('[href$=".css"]', ['l1']),
('link[href$=".css"]', ['l1']),
('link[id$="1"]', ['l1']),
('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
('div[id$="1"]', []),
('[id$="noending"]', []),
)
def test_attribute_contains(self):
self.assertSelectMultiple(
# From test_attribute_startswith
('[rel*="style"]', ['l1']),
('link[rel*="style"]', ['l1']),
('notlink[rel*="notstyle"]', []),
('[rel*="notstyle"]', []),
('link[rel*="notstyle"]', []),
('link[href*="bla"]', ['l1']),
('a[href*="http://"]', ['bob', 'me']),
('[href*="http://"]', ['bob', 'me']),
('[id*="p"]', ['pmulti', 'p1']),
('div[id*="m"]', ['main']),
('a[id*="m"]', ['me']),
# From test_attribute_endswith
('[href*=".css"]', ['l1']),
('link[href*=".css"]', ['l1']),
('link[id*="1"]', ['l1']),
('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
('div[id*="1"]', []),
('[id*="noending"]', []),
# New for this test
('[href*="."]', ['bob', 'me', 'l1']),
('a[href*="."]', ['bob', 'me']),
('link[href*="."]', ['l1']),
('div[id*="n"]', ['main', 'inner']),
('div[id*="nn"]', ['inner']),
)
def test_attribute_exact_or_hypen(self):
self.assertSelectMultiple(
('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
('p[lang|="fr"]', ['lang-fr']),
('p[lang|="gb"]', []),
)
def test_attribute_exists(self):
self.assertSelectMultiple(
('[rel]', ['l1', 'bob', 'me']),
('link[rel]', ['l1']),
('a[rel]', ['bob', 'me']),
('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
('p[class]', ['p1', 'pmulti']),
('[blah]', []),
('p[blah]', []),
)
def test_nth_of_type(self):
# Try to select first paragraph
els = self.soup.select('div#inner p:nth-of-type(1)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text')
# Try to select third paragraph
els = self.soup.select('div#inner p:nth-of-type(3)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Another')
# Try to select (non-existent!) fourth paragraph
els = self.soup.select('div#inner p:nth-of-type(4)')
self.assertEqual(len(els), 0)
# Pass in an invalid value.
self.assertRaises(
ValueError, self.soup.select, 'div p:nth-of-type(0)')
def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)')
self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text')
def test_id_child_selector_nth_of_type(self):
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
def test_select_on_element(self):
# Other tests operate on the tree; this operates on an element
# within the tree.
inner = self.soup.find("div", id="main")
selected = inner.select("div")
# The tag was selected. The
您还没有登录,登录后您可以:
-
收藏Android系统代码
-
收藏喜欢的文章
-
多个平台共享账号
去登录
首次使用?从这里 注册