# HG changeset patch
# User Nathan Phillip Brink
# Date 2012-02-14 22:20:03
# Node ID 638ed591b75376c5636d59b935acd2c99278d216
# Parent 556c9319aa6548ddd3b26278041ca5efdaaa365e
Add basic crawler support for Hope College in light of their winning the Calvin v. Hope basketball game last weekend.

The crawler is unable, from the data provided by Hope, to properly
recognize and handle course labs in any fashion. To do that, the
human-formatted comment fields will need to be parsed. These fields
are placed in the same column as the course title and are in the
subsequent row for a particular section -- these fields are currently
ignored through the rule which throws out rows with too few columns.

diff --git a/school.d/hope.crawl.inc b/school.d/hope.crawl.inc
new file mode 100644
--- /dev/null
+++ b/school.d/hope.crawl.inc
@@ -0,0 +1,325 @@
+<?php
+/*
+ *
+ * This file is a part of slate_permutate.
+ *
+ * slate_permutate is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * slate_permutate is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with slate_permutate. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \brief
+ *   Start a Hope crawling session.
+ *
+ * Every by-reference argument is (re)initialized here: $cookies is
+ * emptied, $uri is pointed at Hope's schedule search page, and $dom
+ * and $xpath are filled in from that page once it has been fetched.
+ *
+ * \param $school
+ *   The school handle; only its 'id' is read, for the log message.
+ * \param $uri
+ *   Set to the URI of the schedule search page.
+ * \param $cookies
+ *   Reset to an empty array, then handed to school_crawl_geturi().
+ * \param $dom
+ *   Set to a new DOMDocument into which the fetched page is loaded.
+ * \param $xpath
+ *   Set to a DOMXPath over $dom; only assigned on success.
+ * \param $school_crawl_log
+ *   The school_crawl_log handle.
+ * \return
+ *   0 on success, 1 if the page could not be fetched or parsed.
+ */
+function _hope_crawl_start(array $school, &$uri, array &$cookies, &$dom, &$xpath, &$school_crawl_log)
+{
+  $uri = 'http://plus.hope.edu/PROD/hxskschd.P_hopeSchedule';
+  $cookies = array();
+  $dom = new DOMDocument();
+
+  $start_page = school_crawl_geturi($uri, $cookies, $school_crawl_log);
+
+  /* loadHTML() is only attempted when there is something to parse. */
+  $page_loaded = !empty($start_page) && $dom->loadHTML($start_page);
+  if (!$page_loaded)
+    {
+      school_crawl_logf($school_crawl_log, 2, "Unable to load the HTML document necessary to enumerate %s's list of semesters.",
+                        $school['id']);
+      return 1;
+    }
+
+  $xpath = new DOMXPath($dom);
+  return 0;
+}
+
+/**
+ * \brief
+ *   Crawl the list of available semesters from Hope.
+ *
+ * Crawling starts at
+ * http://plus.hope.edu/PROD/hxskschd.P_hopeSchedule . This is linked
+ * to from http://hope.edu/registrar/nav/schedules.html and from
+ * http://plus.hope.edu/ (which redirects to a PROD page which has
+ * `Release 8.4.2'). The HTTP server claims to be ``Server:
+ * Oracle-Application-Server-10g/10.1.2.0.2 Oracle-HTTP-Server''.
+ *
+ * \param $school
+ *   The school handle for Hope College.
+ * \param $semesters
+ *   The array to which Semester objects shall be appended.
+ * \param $school_crawl_log
+ *   The school_crawl_log handle.
+ */
+function hope_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log)
+{ /*
+   * NOTE(review): this body looks like two functions fused together by
+   * text lost in extraction (the log message "Unable to find the for
+   * %s." below is missing a piece as well). Nothing in the visible
+   * body appends to $semesters, and $semester -- used from the first
+   * foreach onward -- is neither a parameter nor assigned anywhere in
+   * the visible text. Presumably the lost span ended this function and
+   * opened a separate per-semester crawl function; recover it from the
+   * original changeset rather than patching this text.
+   */
+  $uri = NULL;
+  $cookies = array();
+  $dom = NULL;
+  $xpath = NULL;
+
+  if ($ret = _hope_crawl_start($school, $uri, $cookies, $dom, $xpath, $school_crawl_log))
+    return $ret; /* non-zero: the start page could not be loaded */
+
+  if (($dom_select_terms = $xpath->query('.//select[@name="term"]/option[string-length(@value) > 0]')) === FALSE
+      || !$dom_select_terms->length)
+    { /* the query failed or matched no term option with a non-empty value */
+      school_crawl_logf($school_crawl_log, 2, "Unable to find the for %s.",
+                        $school['id']);
+      return 1;
+    }
+
+  foreach ($dom_select_terms as $dom_select_term)
+    { /*
+       * The option text is lowercased and split on a space into season
+       * and year, then compared against $semester. The unset() on a
+       * non-match leaves $dom_select_term unset when the loop runs out
+       * without a break, which the empty() test below relies on.
+       */
+      list($season, $year) = explode(' ', strtolower(trim($dom_select_term->textContent)));
+      if (!strcmp($year, $semester->year_get())
+          && !strcmp($season, $semester->season_get())
+          && $dom_select_term->hasAttribute('value'))
+        break;
+      unset($dom_select_term);
+    }
+  if (empty($dom_select_term))
+    {
+      school_crawl_logf($school_crawl_log, 4, "Unable to find the form input value associated with the %s semester.",
+                        $semester);
+      return 1;
+    }
+
+  $semester_form_node = school_crawl_element_ancestor($dom_select_term, 'form');
+  $semester_form = school_crawl_form($semester_form_node);
+  $semester_form_action = $semester_form_node->getAttribute('action');
+  $semester_form['term'] = $dom_select_term->getAttribute('value');
+
+  foreach ($xpath->query('.//select[@name="sel_subj"]') as $dom_select_term)
+    break; /*
+     * Takes the first matching select element, if any.
+     * NOTE(review): $dom_select_term still holds the term option chosen
+     * above, so when this query matches nothing the variable keeps that
+     * value and the empty() check below cannot fire.
+     */
+  if (empty($dom_select_term))
+    {
+      school_crawl_logf($school_crawl_log, 4, "Unable to find Subject-selecting form input");
+      return 1;
+    }
+
+  /*
+   * Manually select all of the different sorts of subject materials
+   * since selecting no subjects doesn't result in listing them all.
+   */
+  $semester_form['sel_subj'] = array();
+  foreach (school_crawl_form_select_array($dom_select_term, FALSE) as $subject_name => $junk)
+    $semester_form['sel_subj'][] = $subject_name;
+
+  if (!empty($semester_form_action))
+    $uri = school_crawl_url($uri, $semester_form_action);
+  $sections_html = school_crawl_geturi($uri, $cookies, $school_crawl_log, $semester_form);
+
+  /*
+   * Get an HTML-based results page. We only get this page because it
+   * has a form which we can submit to get CSV.
+   */
+  $sections_dom = new DOMDocument();
+  if (empty($sections_html)
+      || !$sections_dom->loadHTML($sections_html))
+    {
+      school_crawl_logf($school_crawl_log, 2, "Unable to load section listings page.");
+      return 1;
+    }
+  $sections_xpath = new DOMXPath($sections_dom);
+
+  /* Look for the "Export to Excel" submit button */
+  $sections_form = $sections_xpath->query('.//form[.//input[@type = "submit" and contains(@value, "xport")]]')->item(0);
+  if (empty($sections_form))
+    {
+      school_crawl_logf($school_crawl_log, 2, "Unable to find CSV link for schedule.");
+      return 1;
+    }
+
+  /* Get the CSV */
+  $sections_form_action = $sections_form->getAttribute('action');
+  if (!empty($sections_form_action))
+    $uri = school_crawl_url($uri, $sections_form_action);
+  $sections_csv = school_crawl_geturi($uri, $cookies, $school_crawl_log, school_crawl_form($sections_form));
+
+  /*
+   * Oracle likes to put random `"' into the middle of a quoted string
+   * instead of properly escaping it like ``"This is a string with a
+   * "" in it"''. This regex rewrites each such doublequote -- one
+   * which is not adjacent to a delimiter (hopefully) -- as `""', the
+   * escaped form shown above. Matches cannot overlap, so two stray
+   * doublequotes separated by a single character are not both fixed.
+   */
+  $sections_csv = preg_replace('/([^,\\n\\r])"([^,\\n\\r])/', '$1""$2', $sections_csv);
+  $sections_csv = school_crawl_csv_parse($sections_csv, array('eof' => TRUE));
+  /*
+   * Skip the introductory lines, seeking for the field headers: the
+   * first row with at least two columns is taken to be the header row.
+   * NOTE(review): if no such row exists, $i ends up equal to
+   * count($sections_csv) and $sections_csv[$i] below is undefined.
+   */
+  for ($i = 0; $i < count($sections_csv) && count($sections_csv[$i]) < 2; $i ++)
+    ;
+
+  $fields = array( /* header name => column; FALSE until the header row supplies it */
+    'Status' => FALSE /*< OPEN, RESTRICTED, IN PROGRESS, or empty */,
+    'Title' => FALSE /*< course name */,
+    'Subject' => FALSE /*< subject id */,
+    'Course Number' => FALSE,
+    'Section Number' => FALSE,
+    'CRN' => FALSE /*< section synonym */,
+    'Cred' => FALSE /*< Number of credits, can be a range which would be formatted like " 1-4" */,
+    /*
+     * ex. "FA1", "FA2", "CH2" (online course?), "CD4", "SRS"
+     * (seniors). If a course has multiple attributes, it will have
+     * multiple lines following it with the attributes but no other
+     * fields filled?
+     */
+    'Attr' => FALSE,
+    /*
+     * The first of 8 columns being Day + times. "M" (or "TBA"), "T",
+     * "W", "R", "F", ?, ?, "1600-1800" or "TBA".
+     */
+    'Meeting Days/Times' => FALSE,
+    'Location' => FALSE /*< The room or TBA */,
+    'Capacity' => FALSE /*< Probably the maximum number of students */,
+    'Actual' => FALSE /*< Possibly the current number of students? */,
+    'Remainder' => FALSE /*< Number of spots to be filled... */,
+    'Instructor' => FALSE /*< The prof/instructor */,
+    /*
+     * The start/end dates in form of 07/02-07/27. This would be
+     * particularly important for supporting half-semester
+     * courses. Bug #122.
+     */
+    'Date' => FALSE,
+    'Weeks' => FALSE /*< The total number of weeks the course meets */,
+    );
+
+  foreach ($sections_csv[$i] as $column => $name)
+    if (!empty($name))
+      $fields[$name] = $column; /* headers not listed above are recorded too */
+  $expected_columns = max($fields); /* largest column value recorded in $fields */
+  foreach ($fields as $name => $location)
+    if ($location === FALSE)
+      { /* a required header was never seen in the header row */
+        school_crawl_logf($school_crawl_log, 2, "Cannot find column named %s in CSV. The column headings line looks like ``%s''.",
+                          $name, implode(',', $sections_csv[$i]));
+        return 1;
+      }
+
+  /*
+   * Label the days of the week and Times column: eight consecutive
+   * columns starting at the one headed 'Meeting Days/Times'.
+   */
+  foreach (array('M', 'T', 'W', 'R', 'F', 'S', 'U', 'Times') as $offset => $name)
+    $fields[$name] = $fields['Meeting Days/Times'] + $offset;
+
+  for ($i ++; $i < count($sections_csv); $i ++)
+    {
+      $section_csv = $sections_csv[$i];
+
+      if (count($section_csv) < $expected_columns)
+        { /*
+           * NOTE(review): $expected_columns is the largest column key,
+           * not a count. If keys are zero-based (the offset arithmetic
+           * on 'Meeting Days/Times' suggests they are integers), a row
+           * with exactly that many entries passes this test yet has no
+           * element at that key -- confirm whether `<=' was intended.
+           */
+          school_crawl_logf($school_crawl_log, 8, "Skipping row which has fewer entries than expected (%d): %s",
+                            $expected_columns, implode(', ', $section_csv));
+          continue;
+        }
+
+      /*
+       * If a section has multiple meetings, each extra meeting is
+       * placed on a row following the first section's entry. However,
+       * the course/synonym/section/subject are all blank on that
+       * line. Therefore, we must propagate these values.
+       *
+       * Each non-blank field overwrites the like-named local variable
+       * (via ${$var}); a blank field leaves the value set by an
+       * earlier row in place.
+       */
+      foreach (array(
+                 'subject_id' => 'Subject',
+                 'course_id' => 'Course Number',
+                 'title' => 'Title',
+                 'section_id' => 'Section Number',
+                 'synonym' => 'CRN',
+                 'instructor' => 'Instructor',
+                 'location' => 'Location',
+               ) as $var => $field)
+        if (strlen(trim($section_csv[$fields[$field]])))
+          ${$var} = trim($section_csv[$fields[$field]]);
+
+      if ($section_csv[$fields['M']] == 'TBA'
+          || $section_csv[$fields['Times']] == 'TBA')
+        { /*
+           * NOTE(review): the title passed here is this row's raw Title
+           * field rather than the propagated $title, so it may be blank
+           * on a continuation row.
+           */
+          $semester->class_add(new Course($subject_id . '-' . $course_id,
+                                          $section_csv[$fields['Title']]));
+          school_crawl_logf($school_crawl_log, 8, "Course %s-%s-%s has a section meeting with a TBA time, adding dummy course.",
+                            $subject_id, $course_id, $section_id);
+          continue;
+        }
+
+      if (preg_match(',(\\d\\d)/(\\d\\d)-(\\d\\d)/(\\d\\d),', $section_csv[$fields['Date']], $matches))
+        { /*
+           * Date looks like MM/DD-MM/DD. The year comes from $semester,
+           * and the end year is bumped when the end month precedes the
+           * start month.
+           */
+          list(, $m_start, $d_start, $m_end, $d_end) = $matches;
+          if ($m_start && $d_start && $m_end && $d_end)
+            {
+              $y_start = $y_end = $semester->year_get();
+              if ($m_end < $m_start)
+                $y_end ++;
+              $semester->time_start_set_test(gmmktime(0, 0, 0, $m_start, $d_start, $y_start));
+              $semester->time_end_set_test(gmmktime(0, 0, 0, $m_end, $d_end, $y_end));
+            }
+        }
+
+      $days = school_crawl_days_format($school_crawl_log, array_filter(array_slice($section_csv, $fields['M'], 7), '_hope_crawl_days_filter')); /*
+       * Only the 7 day columns starting at M are considered.
+       * NOTE(review): _hope_crawl_days_filter is not defined anywhere in
+       * the visible text -- presumably lost along with the rest.
+       */
+      list($time_start, $time_end) = explode('-', $section_csv[$fields['Times']]);
+      if (strlen($time_start) != 4 || strlen($time_end) != 4)
+        { /*
+           * Both halves of Times must be exactly 4 characters long.
+           * NOTE(review): when Times contains no `-', explode() returns a
+           * single element, so list() raises an undefined-offset
+           * notice/warning and $time_end comes out NULL, which then
+           * fails this length test.
+           */
+          school_crawl_logf($school_crawl_log, 4, "Section meeting (synonym=%s) has invalidly-formatted start time (%s) or end time (%s). Skipping.",
+                            $synonym, $time_start, $time_end);
+          continue;
+        }
+
+      /*
+       * Guessing the type of section_meeting: `attribute' of NSL seems to
+       * be associated with labs.
+       */
+      $type = 'lecture';
+      if ($section_csv[$fields['Attr']] == 'NSL')
+        $type = 'lab';
+
+      $section_meeting = new SectionMeeting($days, $time_start, $time_end,
+                                            $location,
+                                            $type,
+                                            $instructor);
+      $semester->section_meeting_add($subject_id,
+                                     $course_id,
+                                     $title,
+                                     $section_id,
+                                     $synonym,
+                                     $section_meeting,
+                                     'default',
+                                     $section_csv[$fields['Cred']]);
+    }
+  return 0;
+}
diff --git a/school.d/hope.inc b/school.d/hope.inc
new file mode 100644
--- /dev/null
+++ b/school.d/hope.inc
@@ -0,0 +1,28 @@
+
+ *
+ * This file is a part of slate_permutate.
+ *
+ * slate_permutate is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * slate_permutate is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with slate_permutate. If not, see .
+ */
+
+function hope_info()
+{ /* Returns the school's static info array: 'name', 'student_address' and 'example_course_id'. */
+  return array(
+    'name' => 'Hope College',
+    'student_address' => 'Flying Dutchman',
+    'example_course_id' => 'ENGL-248',
+  );
+}