lain-ipfs-scraper/scraper.py

import urllib.request, json, BTEdb, re, time
db = BTEdb.Database("/dev/shm/lainchan-scraper.json")
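# /dev/shm is a RAM-backed tmpfs on Linux, so the scrape database never touches disk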
# False: do not save a backup from which the transaction could be reverted
db.BeginTransaction(False)
# This is displayed directly on the page.
status = open("/dev/shm/lainchan-scraper-status", "w")
status.write("Update in progress")
status.close()
# Add more boards as needed
boards = ["lam", "tech"]
# This regex attempts to find IPFS hashes. Right now it just looks for
# 46-character alphanumeric words. There's a better way, since they all
# start with the same two-character prefix, but I will add that in a
# later update.
regex = re.compile(r"\b[A-Za-z0-9]{46}\b")
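# (v0 IPFS hashes are base58-encoded multihashes: "Qm" followed by 44 more characters)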
# Clear last scrape's results
if db.TableExists("table"):
    db.Truncate("table")
else:
    db.CreateTable("table")
for board in boards: # From here it's pretty straightforward
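    # threads.json lists the board's pages; each page carries a "threads" array of thread summaries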
    threads = json.loads(urllib.request.urlopen("https://lainchan.org/" + board + "/threads.json").read().decode("utf-8"))
    for page in threads:
        for thread in page["threads"]:
            print(thread["no"])
            time.sleep(5)  # Sleep 5 seconds between thread requests, as a courtesy and to not overload the site.
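            # /res/<thread no>.json returns the whole thread; its "posts" key holds every post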
            thread_url = "https://lainchan.org/" + board + "/res/" + str(thread["no"]) + ".json"
            for post in json.loads(urllib.request.urlopen(thread_url).read().decode("utf-8"))["posts"]:
if "com" in post: # com is the html text of the post
                    # Pull every matching hash id out of the post so we can
                    # generate URLs and print it to the site; findall returns
                    # all matches, not just the first.
                    for match in regex.findall(post["com"]):
                        db.Insert("table", board = board, match = match, parent_thread_id = thread["no"], time = post["time"], text = post["com"], post = post["no"])
                        print(post["com"])
# Clean up
db.CommitTransaction()
db.Destroy()
status = open("/dev/shm/lainchan-scraper-status", "w")
# The line below looks more complicated than it is. It writes:
# "Last scrape at (current time). Next scrape at (the next o'clock hour)."
# time.time() % 3600 is the number of seconds since the last o'clock hour,
# so 3600 minus that is the number of seconds until the next one; adding
# that to the current time gives the timestamp of the next o'clock hour.
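# Example: at t = 5000 (01:23:20 UTC), 5000 % 3600 = 1400, 3600 - 1400 = 2200,
# and 5000 + 2200 = 7200, which is 02:00:00 UTC.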
status.write("Last scrape at <span class=\"date\">" + str(int(time.time())) + "</span><br /> Next scrape at <span class=\"date\">" + str(3600 - (int(time.time()) % 3600) + int(time.time())) + "</span>")
status.close()