Documentation
commit 19acb88368 (parent b2b590e4c9)
@@ -17,11 +17,13 @@
 <script type="text/javascript" src="https://lainchan.org/js/hilight.js"></script>
 <script>
 $( document ).ready(function() {
+    // On document ready, find all date elements
     var s = document.getElementsByClassName("date");
     for (var i = 0; i < s.length; i++) {
+        // Foreach el in document.find(".date")
         var el = s[i];
         d = new Date(parseInt(el.innerHTML + "000")); // uses milliseconds, hence the extra 000
-        /*el.innerHTML = d.toLocaleDateString() + " " + d.toLocaleTimeString();*/
+        // change the html of el from an epoch to the formatted local time and date of that epoch
         el.innerHTML = d.toLocaleTimeString() + " " + d.toLocaleDateString();
     }
 });
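Aside: the hunk above converts epoch timestamps client-side. The same conversion, sketched in Python for reference (the timestamp value is made up):

    import datetime

    # The page stores Unix epoch seconds, but JavaScript's Date() expects
    # milliseconds: appending "000" multiplies by 1000.
    epoch_seconds = 1467332100            # hypothetical value from a <span class="date">
    epoch_millis = int(str(epoch_seconds) + "000")
    assert epoch_millis == epoch_seconds * 1000

    # Python's fromtimestamp() takes seconds and localizes directly:
    local = datetime.datetime.fromtimestamp(epoch_seconds)
    print(local.strftime("%X %x"))        # time then date, like toLocaleTimeString() + toLocaleDateString()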
@@ -31,13 +33,16 @@
 <h1>IPFS Scraper</h1>
 <div id="status">${open("/dev/shm/lainchan-scraper-status", "r").read()}</div>
 <div id="content">
+<!-- reverse dumped -->
 % for post in dumped[::-1]:
 <div class="post">
+<!-- Generate a link back to the lainchan thread, as well as links to download the ipfs hash from multiple sources, and print the hash itself. -->
 <a class="postlink" href="https://lainchan.org/${post["board"]}/res/${post["parent_thread_id"]}.html#q${post["post"]}" target="_blank">Post</a> -
 <a class="ipfslink-public" href="https://ipfs.io/ipfs/${post["match"]}">public</a>
 <a class="ipfslink-local" href="http://localhost:8080/ipfs/${post["match"]}">local</a> -
 ${post["match"]}
 <span class="date">${post["time"]}</span>
+<!-- We must escape the lambda because either mako, mako-server or paste refuses to print unicode characters -->
 <div class="embedded">${post["text"].replace(u"\u03bb", "lam").replace('href="/' + post["board"], 'href="https://lainchan.org/' + post["board"])}</div>
 </div>
 % endfor
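For reference, the embedded-text cleanup in this hunk is plain string replacement. A standalone sketch with a made-up post dict, showing the lambda escape and the relative-to-absolute link rewrite:

    # Minimal sketch of the template's replace chain (the post dict is invented).
    post = {
        "board": "tech",
        "text": u'\u03bb is life: <a href="/tech/res/123.html">thread</a>',
    }

    cleaned = (post["text"]
               .replace(u"\u03bb", "lam")
               .replace('href="/' + post["board"],
                        'href="https://lainchan.org/' + post["board"]))

    print(cleaned)
    # lam is life: <a href="https://lainchan.org/tech/res/123.html">thread</a>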
scraper.py (21 changed lines)

@@ -1,30 +1,31 @@
 import urllib.request, json, BTEdb, re, time
 db = BTEdb.Database("/dev/shm/lainchan-scraper.json")
+# Do not save a backup to revert the transaction from
 db.BeginTransaction(False)
+# This is displayed directly on the page.
 status = open("/dev/shm/lainchan-scraper-status", "w")
 status.write("Update in progress")
 status.close()
+# Add more boards as needed
 boards = ['lam', "tech"]
+# This regex attempts to find IPFS hashes. Right now it just looks for 46 letter long words. There's a better way because they all start with the same two character string but I will add that in a later update
 regex = re.compile(r"\b[A-Za-z0-9]{46}\b")
+# Clear last scrape's results
 if db.TableExists("table"):
     db.Truncate("table")
 else:
     db.CreateTable("table")
-for board in boards:
-    # DEBUG
-    # time.sleep(5)
-    # continue
-    # END DEBUG
+for board in boards: # From here it's pretty straightforward
     threads = json.loads(urllib.request.urlopen("https://lainchan.org/"+board+"/threads.json").read().decode("utf-8"))
     for page in threads:
         for thread in page["threads"]:
             print(thread["no"])
-            time.sleep(5)
+            time.sleep(5) # Sleep 5 seconds between thread requests, as a courtesy and to not overload the site.
             for post in json.loads(urllib.request.urlopen("https://lainchan.org/" + board + "/res/" + str(thread["no"]) + ".json").read().decode("utf-8"))["posts"]:
-                if "com" in post:
+                if "com" in post: # com is the html text of the post
                     result = re.search(regex, post["com"])
                     if result:
-                        i = 0
+                        i = 0 # From here down is a hack to actually get the matching text (the id) out of the regex results so we can actually generate URLs and print it to the site
                         while True:
                             try:
                                 db.Insert("table", board = board, match = result.group(i), parent_thread_id = thread["no"], time = post["time"], text = post["com"], post = post["no"])
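Note on the hunk above: the regex comment hints that IPFS hashes share a two-character prefix. For CIDv0 that prefix is "Qm" and the encoding is base58 (no 0, O, I or l), so the pattern can be tightened as sketched below; the exact pattern is an editor's suggestion, not part of the commit. The sketch also replaces the while/result.group(i) hack: group(i) walks the capture groups of a single match (and this pattern has none, so only group(0) ever succeeds before the except fires), whereas re.findall() returns every non-overlapping match in the post:

    import re

    # Tighter IPFS-hash pattern: "Qm" prefix plus 44 base58 characters.
    ipfs_regex = re.compile(r"\bQm[1-9A-HJ-NP-Za-km-z]{44}\b")

    sample = "mirror at Qm" + "Y" * 44 + " and Qm" + "Z" * 44

    # findall() yields each 46-char hash directly, ready for
    # db.Insert("table", match=match, ...) without the try/except loop.
    for match in ipfs_regex.findall(sample):
        print(match)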
@@ -32,9 +33,13 @@ for board in boards:
                             except:
                                 break
                             i+= 1
+# Clean up
 db.CommitTransaction()
 db.Destroy()
 import time
 status = open("/dev/shm/lainchan-scraper-status", "w")
+# The line below looks complicated but it's not.
+# Last scrape at (current time). Next scrape at (next hour after the current time)
+# We take the current time and modulo it 3600 to get the seconds since the last hour. We then take 3600 - that value to get the seconds until the next hour. We add the result of that to the current time to get the time of the next "o'clock" hour
 status.write("Last scrape at <span class=\"date\">" + str(int(time.time())) + "</span><br /> Next scrape at <span class=\"date\">" + str(3600 - (int(time.time()) % 3600) + int(time.time())) + "</span>")
 status.close()
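The next-scrape arithmetic in this last hunk, spelled out with a fixed timestamp (the value is made up):

    now = 1467332100                      # 15 min past the hour: now % 3600 == 900
    since_hour = now % 3600               # seconds since the last o'clock hour
    until_hour = 3600 - since_hour        # seconds until the next one
    assert now + until_hour == 1467334800 # the next whole hour, as written to the status file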