同一レベルの要素を上から順に舐めていくときは findNextSibling()、下から順に舐めるなら findPreviousSibling() を使う。
複数形(findNextSiblings() / findPreviousSiblings())になると、そこを起点に該当する要素を全部返す。
BeautifulSoupで読むのに必須なのはTagクラス。
でTagクラスには
findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
基底クラスのPageElementには
findNext(self, name=None, attrs={}, text=None, **kwargs)
findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs)
findPrevious(self, name=None, attrs={}, text=None, **kwargs)
findAllPrevious(self, name=None, attrs={}, text=None, limit=None, **kwargs)
findParent(self, name=None, attrs={}, **kwargs)
findParents(self, name=None, attrs={}, limit=None, **kwargs)
findNextSibling(self, name=None, attrs={}, text=None, **kwargs)
findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs)
findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs)
で、肝になる関数が
_findOne(self, method, name, attrs, text, **kwargs)
_findAll(self, name, attrs, text, limit, generator, **kwargs)
_findOneが1個、_findAllが複数探してきてくれる。_findAllのgeneratorにnextGenerator, previousGenerator, nextSiblingGenerator, previousSiblingGenerator(findParent/findParentsの場合はparentGenerator)を渡すことで舐める方向を決めている。
例
<html>example
<head>
<title>
Page title
</title>
</head>
<body>
<p id="firstpara" align="center">
This is paragraph one.
</p>
<p id="secondpara" align="blah">
This is paragraph two.
</p>
<div>
first level
<div>
second level
<div>
third level
</div>
</div>
</div>
</body>
</html>
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# Demo of the BeautifulSoup 3 navigation helpers (findNext, findPrevious,
# findParent(s), findNextSibling(s), findPreviousSibling(s)).
#
# Expects a file named test.html in the current working directory and prints
# the tag names (and, for some sections, the text) that each helper returns.
#
# print_function keeps the two-argument print calls below producing the same
# output as the Python 2 print statement while also parsing on Python 3.
from __future__ import print_function

from BeautifulSoup import BeautifulSoup

# Read the whole document and close the file handle deterministically.
with open(u'test.html', u'r') as fi:
    soup = BeautifulSoup(fi.read())

print(u'### document')
print(u'tag name: ', soup.name)

html = soup.html
print(u'### top html')
print(u'tag name: ', html.name)

# findNext: step forward through the document one tag at a time.
head = html.findNext()
print(u'### findNext @ html')
print(u'tag name: ', head.name)

title = head.findNext()
print(u'### findNext @ head')
print(u'tag name: ', title.name)

body = title.findNext()
print(u'### findNext @ title')
print(u'tag name: ', body.name)

# findPrevious: step backward from <body>, one tag at a time.
current = body.findPrevious()
print(u'### findPrevious @ body')
print(u'tag name: ', current.name)

current = current.findPrevious()
print(u'### findPrevious @ title')
print(u'tag name: ', current.name)

current = current.findPrevious()
print(u'### findPrevious @ head')
print(u'tag name: ', current.name)

# findAllNext: every match after <body>.
current = body.findAllNext()
print(u'### findAllNext @ body')
for item in current:
    print(u'tag name: ', item.name)

# findAllPrevious: every match before <body>.
current = body.findAllPrevious()
print(u'### findAllPrevious @ body')
for item in current:
    print(u'tag name: ', item.name)

# findParent: the immediate parent only.
current = body.findParent()
print(u'### findParent @ body')
print(u'tag name: ', current.name)

current = title.findParent()
print(u'### findParent @ title')
print(u'tag name: ', current.name)

# findParents: the whole ancestor chain.
current = body.findParents()
# The search starts from <body>; the heading previously said "title".
print(u'### findParents @ body')
print(u'from tag name: ', body.name)
for item in current:
    print(u'tag name: ', item.name)

# Innermost of the three nested <div> elements in the sample document.
bottom = body.div.div.div
print(u'### findParents @ bottom of div')
print(u'from tag name: ', bottom.name)
current = bottom.findParents()
for item in current:
    print(u'tag name: ', item.name)

# findNextSibling: the next match on the same level.
p = body.findNext()
print(u'### findNextSibling @ body.p')
print(u'from tag name: ', p.name)
print(p.text)
current = p.findNextSibling()
print(u'tag name: ', current.name)
print(current.text)

# findNextSiblings: all following matches on the same level.
p = body.findNext()
print(u'### findNextSiblings @ body.p')
print(u'from tag name: ', p.name)
print(p.text)
current = p.findNextSiblings()
print(current)
for item in current:
    print(u'tag name: ', item.name)

# findPreviousSibling: the previous match on the same level.
current = body.div
print(u'### findPreviousSibling @ body.div')
print(u'from tag name: ', current.name)
current = current.findPreviousSibling()
print(u'tag name: ', current.name)
print(current.text)

# findPreviousSiblings: all preceding matches on the same level.
current = body.div
print(u'### findPreviousSiblings @ body.div')
print(u'from tag name: ', current.name)
current = current.findPreviousSiblings()
print(current)
for item in current:
    print(u'tag name: ', item.name)
    # NOTE(review): placed inside the loop because it reads `item`; the
    # original had lost all indentation, so confirm this was the intent.
    print(u'tag text: ', item.text)
0 件のコメント:
コメントを投稿