First commit.

2018-09-18 09:50:52 +03:00 · 2018-09-18 09:50:52 +03:00 · 1efe991ed5
commit 1efe991ed5
1 changed files with 29 additions and 0 deletions
--- a/facebookgetter.py
+++ b/facebookgetter.py
@ -0,0 +1,29 @@
+import urllib.request
+from bs4 import BeautifulSoup as BSoup
+
+from operator import is_not
+from functools import partial
+from pprint import pprint as print
+
+# Hierarchy goes like this:
+# div “userContent” -> div “text_exposed_root” -> p tags with content
+
+# Facebook posts page whose HTML is downloaded below.
+url = "https://www.facebook.com/pg/tel.aviv.university.confessions/posts/"
+# Read the page inside a ``with`` block so the HTTP response (and the socket
+# behind it) is closed as soon as the body has been read, instead of being
+# left open until garbage collection.
+with urllib.request.urlopen(url) as response:
+    html = response.read().decode('utf-8')
+
+def get_messages(html):
+    """Yield the text of each post found in *html*, one string per post.
+
+    The markup is parsed with the ``html.parser`` backend. Every
+    ``div`` with class ``userContent`` is inspected; the first ``div`` with
+    class ``text_exposed_root`` inside it holds the post body. Boxes
+    without such an inner ``div`` are skipped.
+
+    Each yielded string is the text of the ``p`` tags found in the post
+    body, joined with newlines.
+    """
+    document = BSoup(html, 'html.parser')
+    for outer_box in document.find_all('div', class_='userContent'):
+        post_body = outer_box.find('div', class_='text_exposed_root')
+        if post_body is None:
+            # No "text_exposed_root" wrapper in this box: nothing to yield.
+            continue
+        paragraphs = post_body.find_all('p')
+        yield '\n'.join(paragraph.get_text() for paragraph in paragraphs)
+
+
+# Separate the posts with a blank line and store the result as UTF-8 bytes.
+all_messages = '\n\n'.join(get_messages(html))
+with open('testaru.txt', mode='wb') as f:
+    f.write(all_messages.encode('utf-8'))