First commit.
This commit is contained in:
commit
1efe991ed5
|
@ -0,0 +1,29 @@
|
||||||
|
import urllib.request
|
||||||
|
from bs4 import BeautifulSoup as BSoup
|
||||||
|
|
||||||
|
from operator import is_not
|
||||||
|
from functools import partial
|
||||||
|
from pprint import pprint as print
|
||||||
|
|
||||||
|
# Hierarchy goes like this: div "userContent" -> div "text_exposed_root" -> p tags with content
|
||||||
|
|
||||||
|
# Posts page that this script downloads and parses.
url = "https://www.facebook.com/pg/tel.aviv.university.confessions/posts/"

# Download the page and decode the body as UTF-8. The context manager
# closes the HTTP response as soon as the body has been read, instead of
# leaving the connection open until garbage collection.
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')
|
||||||
|
|
||||||
|
def get_messages(html):
    """Yield the text of each post found in *html*, one string per post.

    The markup is parsed with the ``html.parser`` backend. For every
    ``div`` with class ``userContent`` the first nested ``div`` with class
    ``text_exposed_root`` is looked up; containers without one are skipped.
    The ``p`` tags inside that inner div form the lines of the message.

    Yields:
        The paragraphs' text joined with a newline, one string per message.
    """
    document = BSoup(html, 'html.parser')

    # div "userContent" -> div "text_exposed_root" -> p tags with content
    paragraph_groups = []
    for post in document.find_all('div', class_='userContent'):
        body = post.find('div', class_='text_exposed_root')
        if body is None:
            # No "text_exposed_root" wrapper in this container: skip it.
            continue
        paragraph_groups.append(body.find_all('p'))

    # Emit each message as a single newline-separated string.
    for paragraphs in paragraph_groups:
        yield '\n'.join(paragraph.get_text() for paragraph in paragraphs)
|
||||||
|
|
||||||
|
|
||||||
|
# Separate consecutive messages with one blank line.
all_messages = '\n\n'.join(get_messages(html))

# Write the encoded text in binary mode, so the bytes reach the file
# exactly as produced by str.encode() (UTF-8 by default).
with open('testaru.txt', 'wb') as output_file:
    output_file.write(all_messages.encode())
|
Loading…
Reference in New Issue