Add feature to import articles from mbox files to maintenance.php.

This commit is contained in:
Retro_Guy 2024-12-12 05:53:25 -07:00
parent 4209a8befb
commit 2dda14f55a
1 changed files with 376 additions and 1 deletions

View File

@ -36,7 +36,7 @@ if ($argv[1] != '-newsection') {
/* Change to non root user */
change_identity($uinfo["uid"], $uinfo["gid"]);
$processUser = posix_getpwuid(posix_geteuid());
if ($processUser['name'] != $CONFIG['webserver_user']) {
echo "You are running as user: " . $processUser['name'] . "\n";
echo 'Please run this script as: ' . $CONFIG['webserver_user'] . "\n";
@ -52,6 +52,12 @@ if ($argv[1] != '-newsection') {
exit();
}
if(isset($argv[2])) {
$config_name = get_section_by_group($argv[2]);
} else {
$config_name = null;
}
$logfile = $logdir . '/spoolnews.log';
$lockfile = $lockdir . '/' . $config_name . '-spoolnews.lock';
@ -117,6 +123,17 @@ if ($argv[1][0] == '-') {
echo "Creating section: " . $argv[2] . "\n";
echo create_section($argv[2]);
break;
case "-import-mbox-to-articles":
if (!isset($argv[2]) || !isset($argv[3])) {
echo "Please provide a group name followed by full path to mbox file\n";
exit;
}
if(!isset($config_name) {
echo "Please add " . $argv[2] . " to groups.txt for a section\n";
exit;
}
mbox_import_articles($argv[2], $argv[3]);
break;
case "-clean":
clean_spool();
break;
@ -130,6 +147,9 @@ if ($argv[1][0] == '-') {
echo "-clear-diskcache: Remove all cache files if using Disk Caching\n";
echo "-import: Import articles from a .db3 file (-import alt.test-articles)\n";
echo " You must first add group name to <config_dir>/<section>/groups.txt manually\n";
echo "-import-mbox-to-articles: Import articles from a mbox file\n";
echo " -import-mbox-to-articles alt.test /path/to/file/alt.test.mbox\n";
echo " You must first add group name to <config_dir>/<section>/groups.txt manually\n";
echo "-newsection: Create a new section for groups\n";
echo "-refill: Go back x articles and retrieve missing from remote server\n";
echo " -refill alt.test 3000 will retrive missing articles for alt.test\n";
@ -426,6 +446,361 @@ function remove_articles($group)
@unlink($spooldir . '/' . $group . '-overboard.dat');
}
function mbox_import_articles($group, $mbox)
{
global $spooldir, $CONFIG, $workpath, $path, $config_name, $logfile;
# Prepare databases
$database = $spooldir . '/' . $group . '-articles.db3';
if (file_exists($database)) {
echo $database . ' already exists. Please "-remove ' . $group . '" to import this mbox data' . "\n";
echo "Exiting...\n";
exit;
}
$new_article_dbh = article_db_open($database);
if (!file_exists($mbox)) {
echo 'Cannot open ' . $mbox . "\n";
echo "Exiting...\n";
exit;
}
$article = 0;
$mbox_article = array();
$file = fopen($mbox, 'r');
while (($mbox_line = fgets($file)) !== false) {
if ($article == 0 && trim($mbox_line == '')) {
continue;
}
if (preg_match("/^From /", $mbox_line)) {
if ($article > 0) {
mbox_get_one_article($group, $mbox_article);
echo "Retrieving " . $article . "\n";
$mbox_article = array();
}
$article++;
continue;
}
$mbox_article[] = rtrim($mbox_line);
}
fclose($file);
mbox_get_one_article($group, $mbox_article);
echo "Retrieving " . $article . "\n";
exit;
}
// GET INDIVIDUAL ARTICLE
function mbox_get_one_article($group, $article_array)
{
global $CONFIG, $config_name, $rslight_gpg, $spooldir, $logfile, $debug_log;
$grouppath = $spooldir . '/articles/' . preg_replace('/\./', '/', $group);
if (!is_dir($grouppath)) {
mkdir($grouppath, 0777, 'recursive');
}
// $banned_names = file($user_ban_file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$msgid_filter = get_config_value('header_filters.conf', 'Message-ID');
$subject_filter = get_config_value('header_filters.conf', 'Subject');
$from_filter = get_config_value('header_filters.conf', 'From');
$path_filter = get_config_value('header_filters.conf', 'Path');
// Create array for article, then send to insert_article_from_array()
if (isset($current_article)) {
unset($current_article);
$current_article = array();
}
$local = get_next_article_number($group);
$articleHandle = $grouppath . "/" . $local;
unlink($articleHandle);
unset($references);
$lines = 0;
$bytes = 0;
$ref = 0;
$sub = 0;
$ng = 0;
$supersedes = false;
$boundary = false;
$banned = false;
$integrity = false;
$is_header = 1;
$body = "";
$content_transfer_encoding = null;
foreach ($article_array as $response) {
$is_xref = false;
$bytes = $bytes + mb_strlen($response, '8bit');
if (trim($response) == "" && $lines > 0) {
if ($is_header == 1) {
file_put_contents($articleHandle, $current_article['xref'] . "\n", FILE_APPEND);
}
$is_header = 0;
}
if ($is_header == 1) {
$response = str_replace("\t", " ", $response);
if (strpos($response, ': ') !== false) {
$ref = 0;
$sub = 0;
$ng = 0;
}
// Find article date
if (stripos($response, "Date: ") === 0) {
$finddate = explode(': ', $response, 2);
$artdate = strtotime($finddate[1]);
}
if (stripos($response, "Injection-Date: ") === 0) {
$finddate = explode(': ', $response, 2);
$injectiondate = strtotime($finddate[1]);
}
if (stripos($response, "Supersedes: ") === 0) {
$supersedes = explode(': ', $response, 2);
$supersedes = $supersedes[1];
}
// Get overview data
if (stripos($response, "Message-ID: ") === 0) {
$mid = explode(': ', $response, 2);
if (preg_match($msgid_filter, $mid[1])) {
$banned = "msgid_filter";
}
}
if (stripos($response, "From: ") === 0) {
$from = explode(': ', $response, 2);
if (preg_match($from_filter, $from[1])) {
$banned = "from_filter";
}
}
if (stripos($response, "Path: ") === 0) {
$msgpath = explode(': ', $response, 2);
if (preg_match($path_filter, $msgpath[1])) {
$banned = "path_filter";
}
}
if (stripos($response, "Subject: ") === 0) {
$this_subject = explode('Subject: ', $response, 2);
$subject = $this_subject[1];
$sub = 1;
if (preg_match($subject_filter, $subject)) {
$banned = "subject_filter";
}
}
// Transfer encoding
if (stripos($response, "Content-Transfer-Encoding: ") === 0) {
$enco = explode(': ', $response, 2);
$content_transfer_encoding = $enco[1];
}
if (stripos($response, "Newsgroups: ") === 0) {
$response = str_ireplace($group, $group, $response);
// Identify each group name for xref
$groupnames = explode("Newsgroups: ", $response);
$allgroups = preg_split("/\ |\,/", $groupnames[1]);
// Create Xref: header
$current_article['xref'] = "Xref: " . $CONFIG['pathhost'];
foreach ($allgroups as $agroup) {
$agroup = trim($agroup);
if ((! testGroup($agroup)) || $agroup == '') {
continue;
}
if ($group == $agroup) {
$artnum = $local;
} else {
$artnum = get_next_article_number($agroup);
}
if ($artnum > 0) {
$current_article['xref'] .= ' ' . $agroup . ':' . $artnum;
}
}
$ng = 1;
}
if (stripos($response, "Xref: ") === 0) {
if (isset($CONFIG['enable_nntp']) && $CONFIG['enable_nntp'] == true) {
$is_xref = true;
}
$xref = $response;
}
if (stripos($response, "Content-Type: ") === 0) {
preg_match('/.*charset=.*/', $response, $te);
$content_type = explode("Content-Type: text/plain; charset=", $te[0]);
if (preg_match('/.*boundary=.*/', $response, $be)) {
$boundary = explode("boundary=", $response, 2);
$boundary = trim($boundary[1], '\";');
}
}
if (stripos($response, "References: ") === 0) {
$this_references = explode('References: ', $response);
$references = $this_references[1];
$ref = 1;
}
if (preg_match('/^\s/', $response) && $ng == 1) {
$addgroups = preg_split("/\ |\,/", trim($response));
$allgroups = array_merge($allgroups, $addgroups);
}
if (preg_match('/^\s/', $response) && $ref == 1) {
$references = $references . $response;
}
if (preg_match('/^\s/', $response) && $sub == 1) {
$subject = $subject . $response;
}
} else {
$body .= $response . "\n";
}
if (! $is_xref) {
file_put_contents($articleHandle, $response . "\n", FILE_APPEND);
}
$response = str_replace("\n", "", str_replace("\r", "", $response));
$lines++;
}
file_put_contents($articleHandle, $response . "\n", FILE_APPEND);
$lines = $lines - 1;
$bytes = $bytes + ($lines * 2);
// Prefer Injection-Date to Date header
// Some newsreaders (PiaoHong) produce a Date header that php does not like
if (isset($injectiondate)) {
$artdate = $injectiondate;
file_put_contents($debug_log, "\n" . format_log_date() . " " . $config_name . " Used Injection-Date " . $artdate . " for: " . $mid[1], FILE_APPEND);
} else {
file_put_contents($debug_log, "\n" . format_log_date() . " " . $config_name . " Used Date " . $artdate . " for: " . $mid[1], FILE_APPEND);
}
// Check if date matches exactly another article and handle else sorting doesn't like it
while (isset($dates_used[$artdate])) {
$artdate = $artdate + 1;
$finddate[1] = date("D, j M Y G:i:s (T)", $artdate);
file_put_contents($debug_log, "\n" . format_log_date() . " " . $config_name . " Rewrote date to: " . $artdate . " " . $finddate[1] . " for " . $group . ":" . $local, FILE_APPEND);
}
$article_date = $artdate;
$dates_used[$article_date] = true;
// Don't spool article if $banned or fails integrity test
$integrity = check_article_integrity(file($articleHandle), $artdate);
if (($banned !== false) || ($integrity !== false)) {
unlink($articleHandle);
if ($integrity) {
file_put_contents($logfile, "\n" . format_log_date() . " " . $integrity, FILE_APPEND);
} elseif ($banned) {
file_put_contents($spamlog, "\n" . format_log_date() . " " . $banned . " :\tSPAM\t" . $mid[1] . "\t" . $groupnames[1] . "\t" . $from[1], FILE_APPEND);
}
$article++;
} else {
if ((strpos($CONFIG['nocem_groups'], $group) !== false) && ($CONFIG['enable_nocem'] == true)) {
if (strpos($subject, $nocem_check) !== false) {
$is_from = address_decode($from[1], 'nowhere');
$nocem_file = tempnam($spooldir . "/nocem", $is_from[0]['mailbox'] . "@" . $is_from[0]['host'] . "[" . date("Y.m.d.H.i.s") . "]");
copy($articleHandle, $nocem_file);
chmod($nocem_file, 0644);
if ($save_nocem_messages == true) {
$saved_nocem_file = tempnam($nocem_dir, $is_from[0]['mailbox'] . "@" . $is_from[0]['host'] . "[" . date("Y.m.d.H.i.s") . "]-");
copy($articleHandle, $saved_nocem_file);
}
}
}
if ((strpos($rslight_gpg['nntp_group'], $group) !== false) && ($rslight_gpg['enable'] == '1')) {
if (strpos($subject, $bbsmail_check) !== false) {
$bbsmail_file = preg_replace('/@@RSL /', '', $subject);
$bbsmail_filename = $spooldir . "/bbsmail/in/bbsmail-" . $bbsmail_file;
copy($articleHandle, $bbsmail_filename);
}
}
$this_article = file_get_contents($articleHandle);
if ($CONFIG['article_database'] == '1') {
unlink($articleHandle);
// CREATE SEARCH SNIPPET
if ($boundary !== false) {
$body_array = explode("\n", $body);
$found = false;
$start = false;
foreach ($body_array as $line) {
if ($found === false) {
if (strpos($line, $boundary) !== false) {
$found = true;
continue;
} else {
continue;
}
}
if (trim($line != '') && $start === false) {
continue;
} else {
if ($start === false) {
$start = true;
continue;
}
}
$newbody .= $line . "\n";
}
file_put_contents($debug_log, "\n" . format_log_date() . " Created snippet from multipart article: " . $mid[1], FILE_APPEND);
} else {
$newbody = $body;
}
$this_snippet = get_search_snippet($newbody, $content_type[1], $content_transfer_encoding);
unset($newbody);
} else {
touch($articleHandle, $article_date);
}
$current_article['mid'] = $mid[1];
$current_article['epochdate'] = $article_date;
$current_article['stringdate'] = $finddate[1];
$current_article['from'] = $from[1];
$current_article['subject'] = $subject;
$current_article['references'] = $references;
$current_article['bytes'] = $bytes;
$current_article['lines'] = $lines;
$current_article['article'] = $this_article;
$current_article['snippet'] = $this_snippet;
// Check Spam
$res = 0;
if (isset($CONFIG['spamassassin']) && ($CONFIG['spamassassin'] == true) && ($OVERRIDES['disable_spamassassin_spooling'] !== true)) {
$spam_result_array = check_spam($subject, $from[1], $groupnames[1], $references, $this_article, $mid[1]);
$res = $spam_result_array['res'];
$spamresult = $spam_result_array['spamresult'];
$spamcheckerversion = $spam_result_array['spamcheckerversion'];
$spamlevel = $spam_result_array['spamlevel'];
}
if ($res === 1) {
unlink($articleHandle);
file_put_contents($logfile, "\n" . format_log_date() . " " . $config_name . " Skipping: " . $CONFIG['remote_server'] . " " . $group . ":" . $mid[1] . " Exceeds Spam Score", FILE_APPEND);
// $orig_newsgroups = $newsgroups;
// $newsgroups = $CONFIG['spamgroup'];
// $group = $newsgroups;
$local--;
} else {
$pass = false;
foreach ($allgroups as $agroup) {
$agroup = trim($agroup);
if ((! testGroup($agroup)) || $agroup == '') {
continue;
}
$current_article['group'] = $agroup;
if ($group == $agroup) {
$current_article['local'] = $local;
} else {
$current_article['local'] = get_next_article_number($agroup);
}
file_put_contents($logfile, "\n" . format_log_date() . " " . $config_name . " Preparing to spool " . $group . ":" . $mid[1], FILE_APPEND);
$tmp = insert_article_from_array($current_article, false);
if ($tmp[0] != "4") {
$pass = true;
} else {
file_put_contents($logfile, "\n" . format_log_date() . " " . $config_name . " " . $tmp, FILE_APPEND);
}
}
}
$local++;
}
if ($supersedes !== false) {
if (isset($OVERRIDES['enable_supersedes_support']) && $OVERRIDES['enable_supersedes_support'] == true) {
file_put_contents($debug_log, "\n" . format_log_date() . " Found Supersedes: " . $mid[1] . " for: " . $supersedes, FILE_APPEND);
if (!check_remote_for_msgid($supersedes)) {
file_put_contents($debug_log, "\n" . format_log_date() . " Will delete: " . $supersedes, FILE_APPEND);
delete_message($supersedes);
}
}
}
}
function import_articles($group)
{
global $spooldir, $CONFIG, $workpath, $path, $config_name, $logfile;