Documentation
commit 19acb88368 (parent b2b590e4c9)
@@ -17,11 +17,13 @@
 <script type="text/javascript" src="https://lainchan.org/js/hilight.js"></script>
 <script>
 $( document ).ready(function() {
+    // On document ready, find all date elements
     var s = document.getElementsByClassName("date");
     for (var i = 0; i < s.length; i++) {
+        // Foreach el in document.find(".date")
         var el = s[i];
         d = new Date(parseInt(el.innerHTML + "000")); // uses milliseconds, hence the extra 000
-        /*el.innerHTML = d.toLocaleDateString() + " " + d.toLocaleTimeString();*/
+        // change the html of el from an epoch to the formatted local time and date of that epoch
         el.innerHTML = d.toLocaleTimeString() + " " + d.toLocaleDateString();
     }
 });
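Aside: the hunk above converts epoch timestamps client-side. The same conversion, sketched in Python for reference (the timestamp value is made up):

    import datetime

    # The page stores Unix epoch seconds, but JavaScript's Date() expects
    # milliseconds: appending "000" multiplies by 1000.
    epoch_seconds = 1467332100            # hypothetical value from a <span class="date">
    epoch_millis = int(str(epoch_seconds) + "000")
    assert epoch_millis == epoch_seconds * 1000

    # Python's fromtimestamp() takes seconds and localizes directly:
    local = datetime.datetime.fromtimestamp(epoch_seconds)
    print(local.strftime("%X %x"))        # time then date, like toLocaleTimeString() + toLocaleDateString()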
@@ -31,13 +33,16 @@
 <h1>IPFS Scraper</h1>
 <div id="status">${open("/dev/shm/lainchan-scraper-status", "r").read()}</div>
 <div id="content">
+<!-- reverse dumped -->
 % for post in dumped[::-1]:
 <div class="post">
+<!-- Generate a link back to the lainchan thread, as well as links to download the ipfs hash from multiple sources, and print the hash itself. -->
 <a class="postlink" href="https://lainchan.org/${post["board"]}/res/${post["parent_thread_id"]}.html#q${post["post"]}" target="_blank">Post</a> -
 <a class="ipfslink-public" href="https://ipfs.io/ipfs/${post["match"]}">public</a>
 <a class="ipfslink-local" href="http://localhost:8080/ipfs/${post["match"]}">local</a> -
 ${post["match"]}
 <span class="date">${post["time"]}</span>
+<!-- We must escape the lambda because either mako, mako-server or paste refuses to print unicode characters -->
 <div class="embedded">${post["text"].replace(u"\u03bb", "lam").replace('href="/' + post["board"], 'href="https://lainchan.org/' + post["board"])}</div>
 </div>
 % endfor
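For reference, the embedded-text cleanup in this hunk is plain string replacement. A standalone sketch with a made-up post dict, showing the lambda escape and the relative-to-absolute link rewrite:

    # Minimal sketch of the template's replace chain (the post dict is invented).
    post = {
        "board": "tech",
        "text": u'\u03bb is life: <a href="/tech/res/123.html">thread</a>',
    }

    cleaned = (post["text"]
               .replace(u"\u03bb", "lam")
               .replace('href="/' + post["board"],
                        'href="https://lainchan.org/' + post["board"]))

    print(cleaned)
    # lam is life: <a href="https://lainchan.org/tech/res/123.html">thread</a>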
scraper.py (21 changed lines)

@@ -1,30 +1,31 @@
 import urllib.request, json, BTEdb, re, time
 db = BTEdb.Database("/dev/shm/lainchan-scraper.json")
+# Do not save a backup to revert the transaction from
 db.BeginTransaction(False)
+# This is displayed directly on the page.
 status = open("/dev/shm/lainchan-scraper-status", "w")
 status.write("Update in progress")
 status.close()
+# Add more boards as needed
 boards = ['lam', "tech"]
+# This regex attempts to find IPFS hashes. Right now it just looks for 46 letter long words. There's a better way because they all start with the same two character string but I will add that in a later update
 regex = re.compile(r"\b[A-Za-z0-9]{46}\b")
+# Clear last scrape's results
 if db.TableExists("table"):
     db.Truncate("table")
 else:
     db.CreateTable("table")
-for board in boards:
-    # DEBUG
-    # time.sleep(5)
-    # continue
-    # END DEBUG
+for board in boards: # From here it's pretty straightforward
     threads = json.loads(urllib.request.urlopen("https://lainchan.org/"+board+"/threads.json").read().decode("utf-8"))
     for page in threads:
         for thread in page["threads"]:
             print(thread["no"])
-            time.sleep(5)
+            time.sleep(5) # Sleep 5 seconds between thread requests, as a courtesy and to not overload the site.
             for post in json.loads(urllib.request.urlopen("https://lainchan.org/" + board + "/res/" + str(thread["no"]) + ".json").read().decode("utf-8"))["posts"]:
-                if "com" in post:
+                if "com" in post: # com is the html text of the post
                     result = re.search(regex, post["com"])
                     if result:
-                        i = 0
+                        i = 0 # From here down is a hack to actually get the matching text (the id) out of the regex results so we can actually generate URLs and print it to the site
                         while True:
                             try:
                                 db.Insert("table", board = board, match = result.group(i), parent_thread_id = thread["no"], time = post["time"], text = post["com"], post = post["no"])
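Note on the hunk above: the regex comment hints that IPFS hashes share a two-character prefix. For CIDv0 that prefix is "Qm" and the encoding is base58 (no 0, O, I or l), so the pattern can be tightened as sketched below; the exact pattern is an editor's suggestion, not part of the commit. The sketch also replaces the while/result.group(i) hack: group(i) walks the capture groups of a single match (and this pattern has none, so only group(0) ever succeeds before the except fires), whereas re.findall() returns every non-overlapping match in the post:

    import re

    # Tighter IPFS-hash pattern: "Qm" prefix plus 44 base58 characters.
    ipfs_regex = re.compile(r"\bQm[1-9A-HJ-NP-Za-km-z]{44}\b")

    sample = "mirror at Qm" + "Y" * 44 + " and Qm" + "Z" * 44

    # findall() yields each 46-char hash directly, ready for
    # db.Insert("table", match=match, ...) without the try/except loop.
    for match in ipfs_regex.findall(sample):
        print(match)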
@@ -32,9 +33,13 @@ for board in boards:
                             except:
                                 break
                             i+= 1
+# Clean up
 db.CommitTransaction()
 db.Destroy()
 import time
 status = open("/dev/shm/lainchan-scraper-status", "w")
+# The line below looks complicated but it's not.
+# Last scrape at (current time). Next scrape at (next hour after the current time)
+# We take the current time and modulo it 3600 to get the seconds since the last hour. We then take 3600 - that value to get the seconds until the next hour. We add the result of that to the current time to get the time of the next "o'clock" hour
 status.write("Last scrape at <span class=\"date\">" + str(int(time.time())) + "</span><br /> Next scrape at <span class=\"date\">" + str(3600 - (int(time.time()) % 3600) + int(time.time())) + "</span>")
 status.close()
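The next-scrape arithmetic in this last hunk, spelled out with a fixed timestamp (the value is made up):

    now = 1467332100                      # 15 min past the hour: now % 3600 == 900
    since_hour = now % 3600               # seconds since the last o'clock hour
    until_hour = 3600 - since_hour        # seconds until the next one
    assert now + until_hour == 1467334800 # the next whole hour, as written to the status file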