"""Scrape post texts from a public Facebook page and save them to a file.

Running this module downloads the posts page at ``url``, extracts the text of
every post that has an expandable body, and writes all of them to
``testaru.txt`` separated by blank lines.
"""
import urllib.request
from functools import partial
from operator import is_not
# NOTE: this rebinds the name ``print`` to ``pprint`` for the whole module.
from pprint import pprint as print

from bs4 import BeautifulSoup as BSoup

# Page whose posts are scraped.
url = "https://www.facebook.com/pg/tel.aviv.university.confessions/posts/"

# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read, instead of being left open.
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')


def get_messages(html):
    """Yield the text of each post found in *html*, one string per post.

    The markup hierarchy this relies on is:
    div "userContent" -> div "text_exposed_root" -> p tags with content.

    Args:
        html: The page markup as a string.

    Yields:
        The text of one post, with its paragraphs joined by newlines.
        "userContent" boxes without a "text_exposed_root" child are skipped.
    """
    soup = BSoup(html, 'html.parser')
    for outer_box in soup.find_all('div', class_='userContent'):
        inner_box = outer_box.find('div', class_='text_exposed_root')
        if inner_box is None:
            # find() returns None when the box has no such child div.
            continue
        # Each message as a single string, one line per paragraph.
        yield '\n'.join(p.get_text() for p in inner_box.find_all('p'))


all_messages = '\n\n'.join(get_messages(html))

# Written as bytes so the output is the default (UTF-8) encoding of the text
# with no platform newline translation.
with open('testaru.txt', 'wb') as f:
    f.write(all_messages.encode())