# Recovered from the git patch "[PATCH] First commit." (author: Cutipus,
# 2018-09-18, commit 1efe991ed55a0feaabfc5841f7f621e214e570d5), which creates
# facebookgetter.py.
"""Download a public Facebook page and dump the text of its posts to a file.

Run as a script, this fetches the posts page at ``url``, extracts the text of
every expanded post body with :func:`get_messages`, and writes the messages
(separated by blank lines) to ``testaru.txt`` encoded as UTF-8.
"""
import urllib.request
from bs4 import BeautifulSoup as BSoup

# Existing file-level imports, retained for compatibility; the code below no
# longer needs them. NOTE: the last one shadows the builtin ``print``.
from operator import is_not
from functools import partial
from pprint import pprint as print

# Page whose posts are scraped.
url = "https://www.facebook.com/pg/tel.aviv.university.confessions/posts/"


def _fetch_html(page_url, timeout=30):
    """Return the body of *page_url* decoded as UTF-8.

    The response is used as a context manager so the connection is closed
    even when reading or decoding fails, and *timeout* (seconds) keeps the
    script from hanging forever on an unresponsive server.

    Raises:
        urllib.error.URLError: if the page cannot be retrieved.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    with urllib.request.urlopen(page_url, timeout=timeout) as response:
        return response.read().decode('utf-8')


def get_messages(html):
    """Yield the text of each post found in *html*, one string per post.

    The markup hierarchy walked is:
    div "userContent" -> div "text_exposed_root" -> p tags with content.

    Args:
        html: The page markup as a string.

    Yields:
        The text of the post's ``<p>`` tags joined with newlines.
    """
    soup = BSoup(html, 'html.parser')
    for message_box in soup.find_all('div', class_='userContent'):
        inner_box = message_box.find('div', class_='text_exposed_root')
        # NOTE(review): boxes without a "text_exposed_root" child are skipped,
        # exactly as before -- confirm that dropping those posts is intended.
        if inner_box is None:
            continue
        # each message as a string
        yield '\n'.join(line.get_text() for line in inner_box.find_all('p'))


def main():
    """Fetch the page at ``url`` and write its messages to ``testaru.txt``."""
    html = _fetch_html(url)
    all_messages = '\n\n'.join(get_messages(html))
    # Binary mode keeps the output bytes identical on every platform (no
    # newline translation); the text is encoded explicitly as UTF-8.
    with open('testaru.txt', 'wb') as f:
        f.write(all_messages.encode('utf-8'))


# Only hit the network and the filesystem when executed as a script, so that
# importing this module (e.g. to reuse get_messages) has no side effects.
if __name__ == "__main__":
    main()