diff --git a/admin/rehash.php b/admin/rehash.php --- a/admin/rehash.php +++ b/admin/rehash.php @@ -60,7 +60,7 @@ function main($argc, $argv) if (isset($opts['crawl-only'])) $crawl_only = split(',', $opts['crawl-only']); - $verbosity = 1; + $verbosity = 5; if (isset($opts['verbosity'])) $verbosity = (int)$opts['verbosity']; if (isset($opts['V'])) @@ -75,7 +75,7 @@ function main($argc, $argv) if ($crawl) { - $ret = school_cache_recreate($crawl_only, $verbosity); + $ret = school_cache_recreate($crawl_only, NULL, $verbosity); if ($ret) { fprintf(STDERR, "error: Unable to successfully crawl schools.\n"); diff --git a/inc/admin.inc b/inc/admin.inc --- a/inc/admin.inc +++ b/inc/admin.inc @@ -250,13 +250,15 @@ function school_cache_semesters_sort (Se * \param $school * The school which should be checked for crawl functionality and * crawled. - * \param $semester_year - * The year of the semester for which we should grab data. - * \param $semester_season - * The season of the year of the semester for which we should grab - * data. + * \param $page + * The Page object for which HTML formatted logs should be outputted + * \param $verbosity + * How verbose to be. Sane values are from 0 through 10. + * \return + * A school_crawl_log handle, upon which school_crawl_log_fetch() + * may be used. 
*/ -function school_crawl(array &$school, $verbosity = 1) +function school_crawl(array &$school, Page $page = NULL, $verbosity = 1) { $school['crawled'] = FALSE; @@ -264,23 +266,32 @@ function school_crawl(array &$school, $v if (!function_exists($school_crawl_func)) return; + $school_crawl_log_opts = array('verbosity' => $verbosity); + if (defined('STDERR')) + $school_crawl_log_opts['stream'] = STDERR; + if ($page !== NULL) + $school_crawl_log_opts['page'] = $page; + $school_crawl_log = school_crawl_log_init($school, $school_crawl_log_opts); + $semesters = array(); if ($verbosity > 0) - fprintf(STDERR, "%s()\n", $school_crawl_func); - $ret = $school_crawl_func($semesters, $verbosity); + school_crawl_logf($school_crawl_log, 2, "Calling crawler..."); + + $ret = $school_crawl_func($semesters, $school_crawl_log, $verbosity); if ($ret) { - fprintf(STDERR, "Crawling %s failed: %s() returned nonzero\n", - $school['id'], $school_crawl_func); - fwrite(STDERR, "\n"); + school_crawl_logf($school_crawl_log, 1, "Crawling %s failed: %s() returned nonzero", + $school['id'], $school_crawl_func); + school_crawl_logf($school_crawl_log, 6, ""); return; } $school['crawled'] = TRUE; $school['crawled_semesters'] = $semesters; - if ($verbosity > 0) - fwrite(STDERR, "\n"); + school_crawl_logf($school_crawl_log, 6, ""); + + return $school_crawl_log; } @@ -295,7 +306,7 @@ function school_crawl(array &$school, $v * \param $verbosity * An integer indicating how loud to be. 
*/ -function school_cache_recreate($crawl_only = NULL, $verbosity = 1) +function school_cache_recreate($crawl_only = NULL, Page $page = NULL, $verbosity = 5) { $school_id_list = school_list(); if (!$school_id_list) @@ -330,7 +341,7 @@ function school_cache_recreate($crawl_on if ($crawl_only === NULL || in_array($school['id'], $crawl_only)) { - school_crawl($school, $verbosity); + $school_crawl_log = school_crawl($school, $page, $verbosity); } else { diff --git a/inc/class.page.php b/inc/class.page.php --- a/inc/class.page.php +++ b/inc/class.page.php @@ -577,4 +577,19 @@ class page } } + /** + * \brief + * Generate special code to close a self-closing XHTML/HTML + * element. + * + * \return + * A string containing the correct self-closing chars. For + * example, this would be ' /' for XHTML. + */ + public function element_self_close() + { + if ($this->xhtml) + return ' /'; + return ''; + } } diff --git a/inc/school.crawl.inc b/inc/school.crawl.inc --- a/inc/school.crawl.inc +++ b/inc/school.crawl.inc @@ -26,6 +26,91 @@ /** * \brief + * Initialize a school_crawl_log handle. + * + * \param $school + * The school for which this crawl handle is. + * \param $opts + * An array optionally with one of the following keys: + * - stream: an fopen()-compatible stream to fwrite()/fprintf() output to. + * - page: a Page object used to help format HTML output. + * - verbosity: A number from 0 through 10 describing the desired + * verbosity. + */ +function school_crawl_log_init(array $school, $opts = array()) +{ + $opts += array('verbosity' => 5); + return array('school' => $school, 'out' => array('html' => array(), 'plain' => array())) + $opts; +} + +/** + * \brief + * Log progress of a crawler. + * + * This function's arguments take the same style as fprintf() does. + * + * \param $school_crawl_log + * The logging resource. + * \param $verboseness + * The verbosity level at which to log the message. 
Should be a + * value from 0 to 10, where 0 is unconditionally printed and 5 is + * the default. + * \param $format + * The printf()-style format string. + */ +function school_crawl_logf(array $school_crawl_log, $verboseness, $format) +{ + $args = func_get_args(); + array_shift($args); + array_shift($args); + + if ($verboseness > $school_crawl_log['verbosity']) + /* + * The given message gives us more detail than we want. Therefore, + * discard it. + */ + return; + + $log_line = call_user_func_array('sprintf', $args); + + /* store output in a place where it's retrievable */ + $school_crawl_log['out']['plain'][] = sprintf("%s_crawl(): %s\n", + $school_crawl_log['school']['id'], $log_line); + + /* store the output in a retrievable list of outputs */ + if (isset($school_crawl_log['page'])) + $school_crawl_log['out']['html'][] = sprintf("
%s_crawl(): %s

\n", + $school_crawl_log['school']['id'], htmlentities($log_line), + $school_crawl_log['page']->element_self_close()); + + /* print to a stream potentially */ + if (isset($school_crawl_log['stream'])) + fprintf($school_crawl_log['stream'], "%s_crawl(): %s\n", $school_crawl_log['school']['id'], $log_line); + + return 0; +} + +/** + * \brief + * Recover stored crawling log stuffage. + * + * \param $html + * Whether to retrieve formatted HTML output if it's available. + * \return + * An array of output lines. + */ +function school_crawl_log_fetch(array $school_crawl_log, $html = FALSE) +{ + if ($html) + if (isset($school_crawl_log['page'])) + return $school_crawl_log['out']['html']; + else + return nl2br(htmlentities($school_crawl_log['out']['plain'])); + return $school_crawl_log['out']['plain']; +} + +/** + * \brief * Parse a simple time string into slate_permutate's time * representation. * @@ -150,6 +235,8 @@ function school_crawl_days_str_format($d * The URL to fetch. If a redirect occurs, this is updated. * \param $cookies * An associative array of cookies and where to save new cookies. + * \param $school_crawl_log + * The school_crawl_log handle to use. * \param $post * If not NULL, causes an HTTP POST. In that case, should be an * associative array of form keys/values. @@ -161,8 +248,6 @@ function school_crawl_days_str_format($d * A function which is passed a curl handle which allows the caller * to do silly things like setting CURLOPT_SSLVERSION for silly * sites like ccbcmd's registration site. - * \param $verbosity - * How verbose to be. * \param $loopspin * An internal variable to prevent us from following perpetual * redirects. @@ -171,23 +256,17 @@ function school_crawl_days_str_format($d * malformed HTML, especially with Calvin's WebAdvisor * installation). 
*/ -function school_crawl_geturi(&$uri, &$cookies, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $verbosity = 0, $loopspin = 0) +function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $loopspin = 0) { - global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf, $school_crawl_geturi_verbosity; + global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf; - if ($verbosity > 5) - { - echo "\n"; - echo 'school_crawl_geturi(' . $uri . ")\n"; - echo "\n"; - } + school_crawl_logf($school_crawl_log, 7, "school_crawl_geturi('%s').", $uri); $curl = curl_init(); if ($curlsetup_hook !== NULL) $curlsetup_hook($curl); - $school_crawl_geturi_verbosity = $verbosity; $school_crawl_geturi_write_buf = ''; $school_crawl_geturi_headers_buf = ''; curl_setopt($curl, CURLOPT_URL, $uri); @@ -200,8 +279,7 @@ function school_crawl_geturi(&$uri, &$co $cookies_str .= $key . '=' . $val; } - if ($verbosity > 8) - echo 'cookies sent: ' . $cookies_str . "\n"; + school_crawl_logf($school_crawl_log, 10, "cookies sent: %s", $cookies_str); curl_setopt($curl, CURLOPT_COOKIE, $cookies_str); curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'school_crawl_geturi_header_cb'); curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'school_crawl_geturi_write_cb'); @@ -236,8 +314,7 @@ function school_crawl_geturi(&$uri, &$co $posttxt .= (strlen($posttxt) ? '&' : '') . urlencode($postkey) . '=' . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval); } - if ($verbosity > 8) - echo 'setting POST to ' . $posttxt . "\n"; + school_crawl_logf($school_crawl_log, 10, "Setting POST to %s", $posttxt); /* curl_setopt($curl, CURLOPT_POST, TRUE); */ curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt); @@ -256,20 +333,15 @@ function school_crawl_geturi(&$uri, &$co continue; list($header_name, $header_val) = explode(': ', $header, 2); - if ($verbosity > 8) - echo $header_name . ' : ' . 
$header_val . "\n"; + school_crawl_logf($school_crawl_log, 9, "%s: %s", $header_name, $header_val); switch($header_name) { case 'Set-Cookie': list($cookie_name, $cookie_val) = explode('=', $header_val, 2); - if ($verbosity > 9) - { - if (isset($cookies[$cookie_name])) - echo 'Replacing cookie ' . $cookie_name . '=' . $cookies[$cookie_name] - . ' with '; - echo 'new cookie ' . $cookie_name . '=' . $cookie_val . "\n"; - } + if (isset($cookies[$cookie_name])) + school_crawl_logf($school_crawl_log, 10, "Replacing cookie %s=%s with...", $cookie_name, $cookies[$cookie_name]); + school_crawl_logf($school_crawl_log, 10, "...new cookie %s=%s.", $cookie_name, $cookie_val); $cookies[$cookie_name] = $cookie_val; break; @@ -291,11 +363,10 @@ function school_crawl_geturi(&$uri, &$co && !strcasecmp('refresh', $meta_node->getAttribute('http-equiv'))) { $meta_content = $meta_node->getAttribute('content'); - if ($verbosity > 2) - echo 'Following http-equiv Refresh: ' . $meta_content . PHP_EOL; + school_crawl_logf($school_crawl_log, 7, "Following http-equiv Refresh: %s", $meta_content); if (!(preg_match('/^[0-9]+; *url=(.*)$/', $meta_content, $meta_matches))) { - echo 'Error following http-equiv Refresh: ' . $meta_content . 
PHP_EOL; + school_crawl_logf($school_crawl_log, 0, "Error following http-equiv Refresh: %s", $meta_content); } else { @@ -305,12 +376,11 @@ function school_crawl_geturi(&$uri, &$co } } - if ($verbosity > 9) - echo $school_crawl_geturi_write_buf; + school_crawl_logf($school_crawl_log, 10, "%s", $school_crawl_geturi_write_buf); if ($location && $loopspin < 6) { $uri = $location; - return school_crawl_geturi($uri, $cookies, $post, $follow_meta_refresh, $curlsetup_hook, $verbosity, $loopspin + 1); + return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, $loopspin + 1); } return $school_crawl_geturi_write_buf; } diff --git a/school.d/calvin.crawl.inc b/school.d/calvin.crawl.inc --- a/school.d/calvin.crawl.inc +++ b/school.d/calvin.crawl.inc @@ -25,10 +25,10 @@ * \param $semesters * An array to be filled with Semester objects which I should * populate. - * \param $verbosity - * How verbose I should be. Sensicle range is from 0 through 10. + * \param $school_crawl_log + * A school_crawl_log handle. */ -function calvin_crawl(array &$semesters, $verbosity = 1) +function calvin_crawl(array &$semesters, &$school_crawl_log) { /** * The first link we start at is the one from KV into WebAdvisor. @@ -52,19 +52,16 @@ function calvin_crawl(array &$semesters, $baseuri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL'; $token_uri = $baseuri . '&TOKENIDX=NULL'; - $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($token_uri, $cookies)); + $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($token_uri, $cookies, $school_crawl_log)); if (!preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', $token_html, $matches)) { - fprintf(STDERR, "Could not steal the token\n"); + school_crawl_logf($school_crawl_log, 1, "Could not steal the token: crawling failed."); return 1; } $token = $matches[1]; - if ($verbosity > 5) - { - echo 'token: ' . $token . 
"\n"; - echo "\n"; - } + school_crawl_logf($school_crawl_log, 7, "token: %s.", $token); + school_crawl_logf($school_crawl_log, 7, ""); /* * here we have arrived at the main webadvisor screen which lists the @@ -73,7 +70,7 @@ function calvin_crawl(array &$semesters, * individual department for courses. */ $uri = $baseuri . '&TOKENIDX=' . $token; - $departments_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies)); + $departments_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log)); $departments_dom = new DOMDocument(); $departments_dom->loadHTML($departments_html); @@ -123,8 +120,7 @@ function calvin_crawl(array &$semesters, $return_url = dom_input_value($departments_dom, 'RETURN.URL'); - if ($verbosity > 4) - fprintf(STDERR, "Available semesters: %s\n", implode($semester_strs, ', ')); + school_crawl_logf($school_crawl_log, 7, "Available semesters: %s.", implode($semester_strs, ', ')); $semester_start_uri = $uri; @@ -139,7 +135,7 @@ function calvin_crawl(array &$semesters, { if (empty($season_map[substr($semester_str, 3)])) { - fprintf(STDERR, "Warning: Unknown semester identification chars: %s. Skipping this semester.\n", + school_crawl_logf($school_crawl_log, 6, "Warning: Unknown semester identification chars: %s. Skipping this semester.", $semester_str); continue; } @@ -159,9 +155,8 @@ function calvin_crawl(array &$semesters, $course_level = ''; $uri = $semester_start_uri; - if ($verbosity) - fprintf(STDERR, "Crawling semester %s->%s\n", - $semester_str, $semester_info); + school_crawl_logf($school_crawl_log, 6, "Crawling semester %s->%s.", + $semester_str, $semester_info); /* * LIST.VAR_: is the column, is the row. 
There @@ -247,7 +242,7 @@ function calvin_crawl(array &$semesters, $pages = array(1 => 0, 2=> 1); while ($pages[1] < $pages[2]) { - $html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $form)); + $html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log, $form)); $results_dom = new DOMDocument(); $results_dom->loadHTML($html); @@ -281,16 +276,13 @@ function calvin_crawl(array &$semesters, if (preg_match(';\(([0-9]+)\);', $sec_short_title, $matches)) $synonym = $matches[1]; - if ($verbosity > 6) - { - echo "\n"; - echo implode('-', $section_id) . ': ' . $sec_short_title . "\n"; - echo $openness . "\n"; - echo $sec_meeting_info . "\n"; - echo $faculty_name . "\n"; - echo $credits . "\n"; - echo $comment . "\n"; - } + school_crawl_logf($school_crawl_log, 10, ""); + school_crawl_logf($school_crawl_log, 10, implode('-', $section_id) . ': ' . $sec_short_title); + school_crawl_logf($school_crawl_log, 10, $openness); + school_crawl_logf($school_crawl_log, 10, $sec_meeting_info); + school_crawl_logf($school_crawl_log, 10, $faculty_name); + school_crawl_logf($school_crawl_log, 10, $credits); + school_crawl_logf($school_crawl_log, 10, $comment); /* * The input format for this is, thankfully, pretty rigid @@ -319,10 +311,9 @@ function calvin_crawl(array &$semesters, if (strpos($sec_meeting_info, 'Times to be Announced') !== FALSE || strpos($sec_meeting_info, 'Days to be Announced') !== FALSE) { - if ($verbosity > 2) - error_log('Skipping class because of incomplete meeting time information: ' - . implode('-', $section_id) . ' has meeting info of `' - . $sec_meeting_info . '\''); + school_crawl_logf($school_crawl_log, 8, 'Skipping class because of incomplete meeting time information: ' + . implode('-', $section_id) . ' has meeting info of `' + . $sec_meeting_info . 
'\''); $skipped_sections['incomplete meeting info'] ++; /* Still add to have less confusing autocomplete */ calvin_crawl_course_add($semester, $section_id['department'], $section_id['course']); @@ -331,8 +322,8 @@ function calvin_crawl(array &$semesters, if (!preg_match(';^([0-9]{2}/[0-9]{2}/[0-9]{4})-([0-9]{2}/[0-9]{2}/[0-9]{4}) (([^ ,]+ )+)([^0-9]+) ([^ ]+) - ([^ ]+), (.*)$;', $sec_meeting_info, $meeting_info_matches)) { - error_log('Unable to parse calvin section meeting info string into start/end/days information for ' - . implode('-', $section_id) . ': ``' . $sec_meeting_info . '\'\''); + school_crawl_logf($school_crawl_log, 8, 'Unable to parse calvin section meeting info string into start/end/days information for ' + . implode('-', $section_id) . ': ``' . $sec_meeting_info . '\'\''); $skipped_sections['invalid meeting info format'] ++; /* * Still add at least the course to the semester so that @@ -351,9 +342,8 @@ function calvin_crawl(array &$semesters, $time_end = school_crawl_time_format(strptime($meeting_info_matches[7], '%I:%M%p')); $meeting_place = $meeting_info_matches[8]; - if ($verbosity > 5) - foreach (array('date_start', 'date_end', 'meeting_type', 'days', 'time_start', 'time_end', 'meeting_place', 'meeting_type') as $var) - echo $var . ':' . ${$var} . 
"\n"; + foreach (array('date_start', 'date_end', 'meeting_type', 'days', 'time_start', 'time_end', 'meeting_place', 'meeting_type') as $var) + school_crawl_logf($school_crawl_log, 10, "%s:%s", $var, ${$var}); $section = new Section($section_id['section'], array(new SectionMeeting($days, $time_start, $time_end, $meeting_place, $meeting_type)), $synonym, $faculty_name); $semester->section_add($section_id['department'], $section_id['course'], $section); @@ -380,14 +370,11 @@ function calvin_crawl(array &$semesters, if (!preg_match(';Page ([0-9]+) of ([0-9]+)\$;m', $html, $pages)) { - error_log('Unable to determine the number of pages in this Calvin resultset'); + school_crawl_logf($school_crawl_log, 0, 'Unable to determine the number of pages in this Calvin resultset'); break; } - if ($verbosity > 0) - { - echo 'calvin_crawl(): finished page ' . $pages[1] . ' of ' . $pages[2] . ' with ' . ($list_row - 1) . " courses.\n"; - } + school_crawl_logf($school_crawl_log, 8, "calvin_crawl(): finished page %d of %d with %d courses.", $pages[1], $pages[2], $list_row - 1); $form = array( 'ACTION*Grp:WSS.COURSE.SECTIONS' => 'NEXT', @@ -395,23 +382,21 @@ function calvin_crawl(array &$semesters, } $has_stat = FALSE; - if ($verbosity > 1) - foreach ($skipped_sections as $reason => $num) - { - if (!$num) - continue; - if (!$has_stat) - error_log('Skipped some sections for : :'); - error_log($reason . ': ' . 
$num); - } + foreach ($skipped_sections as $reason => $num) + { + if (!$num) + continue; + if (!$has_stat) + school_crawl_logf($school_crawl_log, 7, 'Skipped some sections for : :'); + school_crawl_logf($school_crawl_log, 7, "%s: %d", $reason, $num); + } $semester->time_end_set($semester_end_max); $semester->time_start_set($semester_start_min); $semesters[] = $semester; - if ($verbosity) - fprintf(STDERR, "\n"); + school_crawl_logf($school_crawl_log, 6, ""); } return 0; diff --git a/school.d/ccbcmd.crawl.inc b/school.d/ccbcmd.crawl.inc --- a/school.d/ccbcmd.crawl.inc +++ b/school.d/ccbcmd.crawl.inc @@ -24,12 +24,12 @@ * * \param $semester * The Semester object which I should populate. - * \param $verbosity - * A scale from 0 to 10 determining how loud I should be. + * \param $school_crawl_log + * The school_crawl_log handle. * \return * 1 on failure, 0 on success. */ -function ccbcmd_crawl(array &$semesters, $verbosity = 1) +function ccbcmd_crawl(array &$semesters, &$school_crawl_log) { $cookies = array(); @@ -41,11 +41,11 @@ function ccbcmd_crawl(array &$semesters, */ $uri = 'http://ccbcmd.edu/schedule/sched.html'; $semesters_dom = new DOMDocument(); - $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, NULL, TRUE, 'ccbcmd_crawl_curlhook', $verbosity)); + $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, TRUE, 'ccbcmd_crawl_curlhook')); $semesters_select_node = $semesters_dom->getElementById('term_input_id'); if ($semesters_select_node === NULL) { - fprintf(STDERR, "Could not get list of available semesters to choose from\n"); + school_crawl_logf($school_crawl_log, 0, "Could not get list of available semesters to choose from."); return 1; } @@ -54,7 +54,7 @@ function ccbcmd_crawl(array &$semesters, $semesters_form = school_crawl_element_ancestor($semesters_select_node, 'form'); if ($semesters_form === NULL) { - fprintf(STDERR, "Unable to find
associated with semester.\n"); + school_crawl_logf($school_crawl_log, 0, "Unable to find associated with semester."); return 1; } $semesters_post_save = school_crawl_form($semesters_form); @@ -79,13 +79,11 @@ function ccbcmd_crawl(array &$semesters, if (preg_match(';session ([0-9]+);i', $semester_text, $matches)) $semester_season .= '_' . $matches[1]; - if ($verbosity) - fprintf(STDERR, "Crawling semester %s:%s -> %s.\n", $semester_year, $semester_season, $semester_text); + school_crawl_logf($school_crawl_log, 6, "Crawling semester %s:%s -> %s.", $semester_year, $semester_season, trim($semester_text)); $semester = new Semester($semester_year, strtolower($semester_season)); - if ($verbosity > 1) - fprintf(STDERR, "Found semester: %s=``%s''=``%s''.\n", - $semester_value, $semester->id(), trim($semesters_option_node->textContent)); + school_crawl_logf($school_crawl_log, 8, "Found semester: %s=``%s''=``%s''.", + $semester_value, $semester->id(), trim($semesters_option_node->textContent)); /* load stored semester-page URI / form data */ $semesters_post = $semesters_post_save; $uri = $semester_stage_uri; @@ -93,12 +91,12 @@ function ccbcmd_crawl(array &$semesters, $subjects_dom = new DOMDocument(); $uri = school_crawl_url($uri, $semesters_form->getAttribute('action')); - $subjects_dom->loadHTML(school_crawl_geturi($uri, $cookies, $semesters_post, TRUE, 'ccbcmd_crawl_curlhook', $verbosity)); + $subjects_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, $semesters_post, TRUE, 'ccbcmd_crawl_curlhook')); $subjects_form_nodelist = $subjects_dom->getElementsByTagName('form'); if (!$subjects_form_nodelist->length) { - fprintf(STDERR, "Unable to find to submit for the subjects choosing page.\n"); + school_crawl_logf($school_crawl_log, 0, "Unable to find to submit for the subjects-choosing page."); return 1; } $subjects_form_node = $subjects_form_nodelist->item(0); @@ -111,7 +109,7 @@ function ccbcmd_crawl(array &$semesters, $courses_dom = new DOMDocument(); 
$uri = school_crawl_url($uri, $subjects_form_node->getAttribute('action')); - $courses_dom->loadHTML(school_crawl_geturi($uri, $cookies, $subjects_post, TRUE, 'ccbcmd_crawl_curlhook', $verbosity)); + $courses_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, $subjects_post, TRUE, 'ccbcmd_crawl_curlhook')); $courses_xpath = new DOMXPath($courses_dom); @@ -119,7 +117,7 @@ function ccbcmd_crawl(array &$semesters, $tr_header_nodelist = $courses_xpath->query('//table[@class="datadisplaytable" and position()=1]//tr[position()=2]'); if (!$tr_header_nodelist->length) { - fprintf(STDERR, "Unable to find the row of the course/section data table which gives us the mappings of column names onto columns.\n"); + school_crawl_logf($school_crawl_log, 0, "Unable to find the row of the course/section data table which gives us the mappings of column names onto columns."); return 1; } $tr_header_node = $tr_header_nodelist->item(0); @@ -142,13 +140,12 @@ function ccbcmd_crawl(array &$semesters, { if ($value === FALSE) { - fprintf(STDERR, "Unable to find column offset for `%s'.\n", + school_crawl_logf($school_crawl_log, 0, "Unable to find column offset for `%s'.", $name); return 1; } else - if ($verbosity > 6) - echo $name . ' -> ' . $value . 
PHP_EOL; + school_crawl_logf($school_crawl_log, 9, "%s -> %s", $name, $value); $max_offset = max($max_offset, $value); } @@ -202,7 +199,7 @@ function ccbcmd_crawl(array &$semesters, } if (($dash_pos = strpos($time_range_text, '-')) === FALSE) { - fprintf(STDERR, "Unable to understand course's time range format, cannot find dash: ``%s''.\n", + school_crawl_logf($school_crawl_log, 0, "Unable to understand course's time range format, cannot find dash: ``%s''.", $time_range_text); return 1; } @@ -219,14 +216,14 @@ function ccbcmd_crawl(array &$semesters, */ if (strpos($time_end_text, '-') !== FALSE) { - fprintf(STDERR, "College seems to support multiple meeting times per semester which we don't know how to parse (even though slate_permutate itself can handle this situation): ``%s'' time_end_text: ``%s''.\n", + school_crawl_logf($school_crawl_log, 0, "College seems to support multiple meeting times per semester which we don't know how to parse (even though slate_permutate itself can handle this situation): ``%s'' time_end_text: ``%s''.", $time_range_text, $time_end_text); return 1; } $time_end = strptime($time_end_text, '%I:%M %p'); if ($time_end === FALSE || $time_start === FALSE) { - fprintf(STDERR, "Error parsing start or end time: start: ``%s'' end: ``%s''.\n", + school_crawl_logf($school_crawl_log, 0, "Error parsing start or end time: start: ``%s'' end: ``%s''.", $time_start_text, $time_end_text); return 1; } diff --git a/school.d/cedarville.crawl.inc b/school.d/cedarville.crawl.inc --- a/school.d/cedarville.crawl.inc +++ b/school.d/cedarville.crawl.inc @@ -53,30 +53,28 @@ function table_parse($html) } /** Crawls Cedarville course listings. 
$season is "fa" or "sp", year is 4-digit year */ -function cedarville_crawl(array &$semesters, $verbosity = 1) +function cedarville_crawl(array &$semesters, &$school_crawl_log) { $basepath = 'http://cedarville.edu/courses/schedule/'; - if ($verbosity) - echo "cedarville_crawl(): Beginning crawl of Cedarville:\n"; + school_crawl_logf($school_crawl_log, 6, "Beginning crawl of Cedarville:"); - if ($verbosity > 1) - echo "cedarville_crawl(): Determining list of departments.\n"; + school_crawl_logf($school_crawl_log, 7, "Determining list of departments."); - if ($verbosity > 1) - fprintf(STDERR, "cedarville_crawl(): Determining list of semesters.\n"); + school_crawl_logf($school_crawl_log, 8, "Determining list of semesters."); $semesters_dom = new DOMDocument(); $semesters_dom->loadHTML(file_get_contents($basepath)); $content_div_dom = $semesters_dom->getElementById('contenttext'); if (!$content_div_dom) { - fprintf(STDERR, "cedarville_crawl(): Error finding location of the list of departments.\n"); + school_crawl_logf($school_crawl_log, 6, "Error finding location of the list of departments."); if (count($semesters)) { - fprintf(STDERR, "cedarville_crawl(): Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.\n"); + school_crawl_logf($school_crawl_log, 6, "Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached."); return 0; } + school_crawl_logf($school_crawl_log, 0, "Couldn't find any departments."); return 1; } $departments_xpath = new DOMXPath($semesters_dom); @@ -97,36 +95,35 @@ function cedarville_crawl(array &$semest $semester = new Semester($semester_year, $semester_season); - if ($verbosity > 1) - fprintf(STDERR, "cedarville_crawl(): Crawling semester: %s.\n", - $semester_name); + school_crawl_logf($school_crawl_log, 6, "Crawling semester: %s.", + $semester_name); /* * We need two passes because the first 
department's code name is * not accessible available in the first pageload. */ $departments = array(); - if (cedarville_crawl_departments_get($basepath . $semester_href, $departments, $semester_href_parts[0])) + if (cedarville_crawl_departments_get($basepath . $semester_href, $departments, $semester_href_parts[0], $school_crawl_log)) return 1; if (!count($departments)) { - echo "cedarville_crawl(): Unable to get a listing of departments.\n"; + school_crawl_logf($school_crawl_log, 6, "Unable to get a listing of departments."); if (count($semesters)) { - fprintf(STDERR, "cedarville_crawl(): Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached.\n"); + school_crawl_logf($school_crawl_log, 6, "Assuming that I got enough info anyways, returning successful code so that the few semesters I was able to crawl will be cached."); return 0; } + school_crawl_logf($school_crawl_log, 0, "Unable to get listing of departments."); return 1; } /* find the first department whose name we don't yet know */ - if (cedarville_crawl_departments_get($basepath . $semester_href_parts[0] . '_' . current(array_keys($departments)) . '_all.htm', $departments, $semester_href_parts[0])) + if (cedarville_crawl_departments_get($basepath . $semester_href_parts[0] . '_' . current(array_keys($departments)) . '_all.htm', $departments, $semester_href_parts[0], $school_crawl_log)) return 1; $tables = array(); foreach ($departments as $department => $dept_name) { - if ($verbosity > 2) - echo 'cedarville_crawl(): Crawling department ' . $department . ' (' . $dept_name . ")...\n"; + school_crawl_logf($school_crawl_log, 7, "Crawling department %s (%s).", $department, $dept_name); $html = file_get_contents($basepath . $semester_href_parts[0] . '_' . $department . '_' . 
'all.htm'); if (!$html) continue; @@ -183,8 +180,8 @@ function cedarville_crawl(array &$semest $section_parts = Section::parse($course_table[1]); if (count($section_parts) < 3) { - error_log('Error parsing section_id. Given `' . $course_table[1] . '\', interpreted as `' - . implode('-', $section_parts) . '\'. Skipping.'); + school_crawl_logf($school_crawl_log, 6, "Error parsing section_id. Given `%s'; interpreted as `%s'. Skipping.", + $course_table[1], implode('-', $section_parts)); continue; } @@ -199,8 +196,7 @@ function cedarville_crawl(array &$semest $meetings_str = $course_table[6]; if (strpos($meetings_str, 'TBA') !== FALSE) { - if ($verbosity > 1) - error_log('Skipping ' . implode('-', $section_parts) . ' because its meeting time info has `TBA\' in it.'); + school_crawl_logf($school_crawl_log, 8, "Skipping %s because its meeting time info has `TBA' in it.", implode('-', $section_parts)); continue; } $meetings = array(); @@ -213,17 +209,18 @@ function cedarville_crawl(array &$semest if (preg_match(';^Dates:[^0-9]+([/0-9]{8})-([/0-9]{8});', $meetings_str, $meeting_matches)) { - if ($verbosity > 4) - error_log('Skipping some meeting data for ' - . implode('-', $section_parts) . ' because it is a date range: `' - . $meeting_matches[0] . '\''); + /** + * \todo + * This is a perfect place to get Semester's + * time_start and time_end values. + */ + school_crawl_logf($school_crawl_log, 8, "Skipping some meeting data for %s because it is a date range: `%s'.", + implode('-', $section_parts), $meeting_matches[0]); $meetings_str = substr($meetings_str, strlen($meeting_matches[0])); continue; } - if ($verbosity > 0) - error_log('Error parsing meeting time. Given `' . $meetings_str . '\'. Skipping ' - . implode('-', $section_parts)); + school_crawl_logf($school_crawl_log, 6, "Error parsing meeting time. Given `%s'. 
Skipping %s.", $meetings_str, implode('-', $section_parts)); break; } /* prepare for parsing the next meeting time */ @@ -263,13 +260,13 @@ function cedarville_crawl(array &$semest * An associative array mapping department codes onto department * friendly names. */ -function cedarville_crawl_departments_get($dept_url, array &$departments, $season_string) +function cedarville_crawl_departments_get($dept_url, array &$departments, $season_string, $school_crawl_log) { $html = file_get_contents($dept_url); $dept_dom = new DOMDocument(); if (!$dept_dom->loadHTML(cedarville_html_fix($html))) { - echo "cedarville_crawl(): Error determining list of available departments: Unable to parse HTML.\n"; + school_crawl_logf($school_crawl_log, 6, "Error determining list of available departments: Unable to parse HTML."); return 1; } $xpath = new DOMXPath($dept_dom); @@ -280,7 +277,7 @@ function cedarville_crawl_departments_ge $href = $dept_node->getAttribute('href'); if (!preg_match('/^' . preg_quote($season_string, '/') . '_([a-z]+)_[a-z]+\.htm$/', $href, $matches)) { - echo 'cedarvillege_crawl(): Error determining list of available departments: Unable to parse the department string out of href="' . $href . "\".\n"; + school_crawl_logf($school_crawl_log, 6, "cedarvillege_crawl(): Error determining list of available departments: Unable to parse the department string out of href=\"%s\".", $href); return 1; }