# HG changeset patch # User Nathan Phillip Brink # Date 2011-10-10 00:52:51 # Node ID 256650e361e6c477cac0dce3228dfb2d18f524f2 # Parent b183b9a9baebedb7e48a3822e44316d20410bb22 Remove the unused/replace part of the ccbcmd crawler which was used for the old crawler API. diff --git a/school.d/ccbcmd.crawl.inc b/school.d/ccbcmd.crawl.inc --- a/school.d/ccbcmd.crawl.inc +++ b/school.d/ccbcmd.crawl.inc @@ -294,241 +294,6 @@ function ccbcmd_crawl_semester($school, return 0; } -/** - * \brief - * Crawl CCBCMD's registration stuffage. - * - * \param $semesters - * The array of Semester objects which I should populate. - * \param $school_crawl_log - * The school_crawl_log handle. - * \return - * 1 on failure, 0 on success. - */ -function ccbcmd_crawl(array &$semesters, &$school_crawl_log) -{ - $cookies = array(); - - /* - * It seems that http://ccbcmd.edu/schedule/sched.html is what we're - * meant to start from. That's just a redirect to some other page - * from which we get a listing of available semesters and choose - * one. - */ - $uri = 'http://ccbcmd.edu/schedule/sched.html'; - $semesters_dom = new DOMDocument(); - $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, NULL, TRUE, 'ccbcmd_crawl_curlhook')); - $semesters_select_node = $semesters_dom->getElementById('term_input_id'); - if ($semesters_select_node === NULL) - { - school_crawl_logf($school_crawl_log, 0, "Could not get list of available semesters to choose from."); - return 1; - } - - $semester_stage_uri = $uri; - - $semesters_form = school_crawl_element_ancestor($semesters_select_node, 'form'); - if ($semesters_form === NULL) - { - school_crawl_logf($school_crawl_log, 0, "Unable to find
associated with semester."); - return 1; - } - $semesters_post_save = school_crawl_form($semesters_form); - - foreach ($semesters_select_node->childNodes as $semesters_option_node) - { - $semester_text = $semesters_option_node->textContent; - $semester_value = $semesters_option_node->getAttribute('value'); - if (empty($semester_value)) - /* skip the empty ``None'' semester */ - continue; - - if (stripos($semester_text, 'continuing') !== FALSE) - /* skip the year-long semesters dedicated to continuing education */ - continue; - - $semester_text_parts = explode(' ', $semester_text); - $semester_season = $semester_text_parts[0]; - $semester_year = $semester_text_parts[1]; - - /* the college has two separate summer sessions, so distinguish between them */ - if (preg_match(';session ([0-9]+);i', $semester_text, $matches)) - $semester_season .= '_' . $matches[1]; - - school_crawl_logf($school_crawl_log, 6, "Crawling semester %s:%s -> %s.", $semester_year, $semester_season, trim($semester_text)); - $semester = new Semester($semester_year, strtolower($semester_season)); - - school_crawl_logf($school_crawl_log, 8, "Found semester: %s=``%s''=``%s''.", - $semester_value, $semester->id(), trim($semesters_option_node->textContent)); - /* load stored semester-page URI / form data */ - $semesters_post = $semesters_post_save; - $uri = $semester_stage_uri; - $semesters_post[$semesters_select_node->getAttribute('name')] = $semester_value; - - $subjects_dom = new DOMDocument(); - $uri = school_crawl_url($uri, $semesters_form->getAttribute('action')); - $subjects_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, $semesters_post, TRUE, 'ccbcmd_crawl_curlhook')); - - $subjects_form_nodelist = $subjects_dom->getElementsByTagName('form'); - if (!$subjects_form_nodelist->length) - { - school_crawl_logf($school_crawl_log, 0, "Unable to find to submit for the subjects-choosing page."); - return 1; - } - $subjects_form_node = $subjects_form_nodelist->item(0); - $subjects_post = school_crawl_form($subjects_form_node); - - $subjects_select_node = $subjects_dom->getElementById('subj_id'); - foreach ($subjects_select_node->childNodes as $subjects_option_node) - if (!strcasecmp('all', trim($subjects_option_node->textContent))) - $subjects_post[$subjects_select_node->getAttribute('name')][] = $subjects_option_node->getAttribute('value'); - - $courses_dom = new DOMDocument(); - $uri = school_crawl_url($uri, $subjects_form_node->getAttribute('action')); - $courses_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log, $subjects_post, TRUE, 'ccbcmd_crawl_curlhook')); - - $courses_xpath = new DOMXPath($courses_dom); - - /* The second row of the table has all of the headers in it */ - $tr_header_nodelist = $courses_xpath->query('//table[@class="datadisplaytable" and position()=1]//tr[position()=2]'); - if (!$tr_header_nodelist->length) - { - school_crawl_logf($school_crawl_log, 0, "Unable to find the row of the course/section data table which gives us the mappings of column names onto columns."); - return 1; - } - $tr_header_node = $tr_header_nodelist->item(0); - - $section_offsets = array( - 'registration_number' => school_crawl_table_resolve_column($tr_header_node, 'CRN'), - 'section_id' => school_crawl_table_resolve_column($tr_header_node, 'subj/crse/sec'), - /* there's a boolean column which says whether or not the course has any prerequisites/corequisites.... */ - 'credits' => school_crawl_table_resolve_column($tr_header_node, 'credhrs'), - /* there's a column for the number of contact hours, vs. credit hours */ - 'dates' => school_crawl_table_resolve_column($tr_header_node, 'sessiondates'), - ); - foreach (array('title', 'days', 'times', 'instructor', 'location') as $column_key) - $section_offsets[$column_key] = school_crawl_table_resolve_column($tr_header_node, $column_key); - /* there's also a column for ``session dates'' */ - - /* error check and calculate the number of children that a node must have to be */ - $max_offset = 0; - foreach ($section_offsets as $name => $value) - { - if ($value === FALSE) - { - school_crawl_logf($school_crawl_log, 0, "Unable to find column offset for `%s'.", - $name); - return 1; - } - else - school_crawl_logf($school_crawl_log, 9, "%s -> %s", $name, $value); - - $max_offset = max($max_offset, $value); - } - - foreach ($courses_xpath->query('//table[@class="datadisplaytable" and position()=1]//tr') as $tr_node) - { - $children = school_crawl_table_rownodes($tr_node); - if ($children->length < $max_offset) - /* - * Skip this row because it doesn't have all of the columns we - * want and thus it can't be a row containing information - * about a section. - */ - continue; - if (!strcmp($children->item($section_offsets['section_id'])->tagName, 'th')) - /* - * We've hit one of the s filled with s. Skip this one. - */ - continue; - - /* - * There are some rows with the time set to TBA and with empty - * section_id columns. Respond to this by skipping empty - * section_id columns since there's no useful data in these - * rows. We use strlen() < 3 because trim() doesn't take care of - *   :-/ - */ - $section_id = trim($children->item($section_offsets['section_id'])->textContent); - if (strlen($section_id) < 3) - continue; - - $section_id_parts = Section::parse($section_id); - - $registration_number = $children->item($section_offsets['registration_number'])->textContent; - $instructor = $children->item($section_offsets['instructor'])->textContent; - - $section_meetings = array(); - { - $time_range_text = $children->item($section_offsets['times'])->textContent; - if (strpos($time_range_text, 'TBA') !== FALSE) - { - /* - * Add the section to the autocomplete list, just without - * any meeting info (i.e., $section_meetings is still - * empty now). - */ - $semester->section_add($section_id_parts['department'], $section_id_parts['course'], - new Section($section_id_parts['section'], $section_meetings, $registration_number)); - continue; - - } - if (($dash_pos = strpos($time_range_text, '-')) === FALSE) - { - school_crawl_logf($school_crawl_log, 0, "Unable to understand course's time range format, cannot find dash: ``%s''.", - $time_range_text); - return 1; - } - - $time_start_text = substr($time_range_text, 0, $dash_pos); - $time_start = strptime($time_start_text, '%I:%M %p'); - $time_end_text = substr($time_range_text, $dash_pos + 1); - /* - * Make sure that _only_ one date range is specified to ensure - * data integrity. I.e., make sure that the college doesn't - * suddenly support multiple meeting times without our - * anticipating that and then cause us to have invalid - * data. ;-). --binki - */ - if (strpos($time_end_text, '-') !== FALSE) - { - school_crawl_logf($school_crawl_log, 0, "College seems to support multiple meeting times per semester which we don't know how to parse (even though slate_permutate itself can handle this situation): ``%s'' time_end_text: ``%s''.", - $time_range_text, $time_end_text); - return 1; - } - $time_end = strptime($time_end_text, '%I:%M %p'); - if ($time_end === FALSE || $time_start === FALSE) - { - school_crawl_logf($school_crawl_log, 0, "Error parsing start or end time: start: ``%s'' end: ``%s''.", - $time_start_text, $time_end_text); - return 1; - } - - $days = school_crawl_days_str_format($school_crawl_log, $children->item($section_offsets['days'])->textContent); - - $section_meetings[] = new SectionMeeting($days, school_crawl_time_format($time_start), school_crawl_time_format($time_end), - $children->item($section_offsets['location'])->textContent, - $instructor); - - /* check if a semester's date range should be increased */ - $section_dates = $children->item($section_offsets['dates'])->textContent; - if (preg_match(';^([0-9]+)/([0-9]+)-([0-9]+)/([0-9]+)$;', $section_dates, $section_dates_matches)) - { - $semester->time_start_set_test(mktime(0, 0, 0, $section_dates_matches[1], $section_dates_matches[2], $semester->year_get())); - $semester->time_end_set_test( mktime(0, 0, 0, $section_dates_matches[3], $section_dates_matches[4], $semester->year_get())); - } - } - - $semester->section_add($section_id_parts['department'], $section_id_parts['course'], - new Section($section_id_parts['section'], $section_meetings, $registration_number)); - } - - $semesters[] = $semester; - } - - return 0; -} - function ccbcmd_crawl_curlhook(&$curl) { /*