diff --git a/school.d/umich.crawl.inc b/school.d/umich.crawl.inc --- a/school.d/umich.crawl.inc +++ b/school.d/umich.crawl.inc @@ -19,52 +19,65 @@ */ /** - * \brief - * Crawls University of Michigan's schedule. + * \file + * + * All of the code for crawling umich. * * Potential startpoints: * - http://lsa.umich.edu/cg/cg_advsearch.aspx (HTML/curl-based) * - http://ro.umich.edu/schedule/ (harder HTML for semester guessing, one CSV download for entire semester -- <=4MB) * + * A single download, the CSV option, is preferred to having to issue + * a series of HTTP requests. Each HTTP request has a lot of latency + * and overhead which a one-shot download doesn't. + */ + +/** + * \brief + * Retrieve the list of semesters umich has available for crawling. + * + * \todo + * Some error handling. + * + * \param $school + * The school handle for umich. * \param $semesters - * An array to be filled with semesters. - * \param $school_crawl_log - * The school_crawl_log handle. + * An array to which Semester objects should be appended, one for + * each potentially crawlable semester. * \return - * 1 on failure, 0 on success. + * 0 on success, 1 on failure. 
*/ -function umich_crawl(array &$semesters, $school_crawl_log) +function umich_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log) { - $url = 'http://ro.umich.edu/schedule/'; + $uri = 'http://ro.umich.edu/schedule/'; $cookies = array(); - /* determine list of semesters: */ $semesters_dom = new DOMDocument(); - $semesters_dom->loadHTML(school_crawl_geturi($url, $cookies, $school_crawl_log)); + $semesters_dom->loadHTML(school_crawl_geturi($uri, $cookies, $school_crawl_log)); $semesters_xpath = new DOMXPath($semesters_dom); - $tables_nodelist = $semesters_dom->getElementsByTagName('table'); - foreach ($tables_nodelist as $table) + foreach ($semesters_dom->getElementsByTagName('table') as $table) { $table_tr = NULL; foreach ($semesters_xpath->query('tr', $table) as $table_tr) break; if (empty($table_tr)) { - school_crawl_logf($school_crawl_log, 5, "Unable to find first row in table which I suspect to be a table holding all of the semesters I'm interested in."); + school_crawl_logf($school_crawl_log, 5, "Unable to find first row in table which I suspect is a table holding all of the semesters I'm interested in. I will try any other tables in this page and hopefully find one with a row in it..."); continue; } $semester_columns = array( - 'name' => school_crawl_table_resolve_column($table_tr, 'Term'), - 'csv' => school_crawl_table_resolve_column($table_tr, '/[cC][sS][vV]/', 'school_crawl_table_resolve_column_regexcmp'), - ); + 'name' => school_crawl_table_resolve_column($table_tr, 'Term'), + 'csv' => school_crawl_table_resolve_column($table_tr, '/[cC][sS][vV]/', 'school_crawl_table_resolve_column_regexcmp'), + ); foreach ($semester_columns as $semester_column_name => $semester_column) if ($semester_column === FALSE) { - school_crawl_logf($school_crawl_log, 4, "Unable to resolve column %s onto a column in a semester listing table. 
Skipping this table.", + school_crawl_logf($school_crawl_log, 4, "Unable to resolve columns %s onto a column in a semester listing table. Skipping this table.", $semester_column_name); $semester_columns = NULL; + break; } if (empty($semester_columns)) continue; @@ -73,6 +86,7 @@ function umich_crawl(array &$semesters, foreach ($semesters_xpath->query('tr', $table) as $table_tr) if ($first) { + /* Skip the row of column titles. */ $first = FALSE; continue; } @@ -88,33 +102,24 @@ function umich_crawl(array &$semesters, $semester_name->textContent); continue; } + $semester = new Semester($matches[2], $matches[1]); - $a = NULL; foreach ($semesters_xpath->query('descendant::a', $semester_csv) as $a) - break; + if ($a->hasAttribute('href')) + break; if (empty($a) || !$a->hasAttribute('href')) { - school_crawl_logf($school_crawl_log, 4, "Unable to find element with an href attribute for a CSV link for the %s semester. Skipping this semester. (textContent of csv column: %s)", + school_crawl_logf($school_crawl_log, 4, "Unable to find element with an href attribute for a CSV link for the %s semester. Skipping this semester. (textContent of CSV column: %s).", $semester, $semester_csv->textContent); continue; } - if (!umich_crawl_csv($school_crawl_log, $semester, $a->getAttribute('href'))) - { - $semesters[] = $semester; - /** - * \todo - * If we try to crawl more than one umich semester, - * PHP runs out of memory. We need to bump our API - * and rehash script to support incremental crawling - * or early data committing if we want umich - * crawling to work for more than one semester. - */ - return 0; - } - else - school_crawl_logf($school_crawl_log, 2, "Unable to interpret CSV information for %s. Skipping semester.", - $semester); + /* + * Secretively communicate some metadata to + * umich_crawl_semester(). 
+ */ + $semester->umich_csv_href = $a->getAttribute('href'); + $semesters[] = $semester; } } @@ -132,13 +137,13 @@ function umich_crawl(array &$semesters, * \param $csv_href * A link to a CSV file which will be downloaded and parsed. */ -function umich_crawl_csv($school_crawl_log, &$semester, $csv_href) +function umich_crawl_semester(array $school, Semester $semester, &$school_crawl_log) { school_crawl_logf($school_crawl_log, 3, "Crawling %s.", $semester); $cookies = array(); - $uri = $csv_href; + $uri = $semester->umich_csv_href; /* parse into lines and then each row needs to be individually parsed */ $csv = str_getcsv(school_crawl_geturi($uri, $cookies, $school_crawl_log), PHP_EOL); @@ -174,7 +179,6 @@ function umich_crawl_csv($school_crawl_l 'Acad Group' => TRUE, 'Codes' => TRUE, 'SU' => TRUE, - 'Units' => TRUE, ); foreach (str_getcsv($csv[0]) as $col_num => $col_name) @@ -197,6 +201,7 @@ function umich_crawl_csv($school_crawl_l unset($csv[0]); /* Now actually parse some data :-). */ + $row_accumulation = array('Instructor' => ''); foreach ($csv as $row) { $row = str_getcsv($row); @@ -211,6 +216,18 @@ function umich_crawl_csv($school_crawl_l } $dept = $matches[1]; + /** + * \todo + * umich sometimes stores ranges of credit hours for courses, + * formatted like "1.00-3.00". This is generally done for ARR + * courses, where there is negotiation between the faculty and + * the student on how the course is arranged. slate_permutate + * should have a concept of a range of credit hours, then when + * calculating credit hours for the user it can present the + * total as a range... not that hard, but still a task ;-). 
+ */ + $credit_hours = (float)$row[$fields['Units']]; + $days = ''; foreach (array('M' => 'm', 'T' => 't', 'W' => 'w', 'TH' => 'h', 'F' => 'f', 'S' => 's') as $field => $day) @@ -219,10 +236,29 @@ function umich_crawl_csv($school_crawl_l if (!preg_match(';^([0-9]+)-([0-9]+)([AP])M$;', $row[$fields['Time']], $matches)) { - school_crawl_logf($school_crawl_log, 4, "Unable to parse meeting time: `%s'. Skipping section/meeting (synonym=%s).", - $row[$fields['Time']], $synonym); + /* + * Some courses exist but only have sections which have ARR + * for their meeting times. I think this means sometimes + * that the student is to arrange the course meeting with + * the instructor, other times just that the course is + * planned but not scheduled yet. These courses should still + * show up in autocomplete even if they have no meeting + * times. + */ + + if ($row[$fields['Time']] != 'ARR') + /* Log an unanticipated Time value */ + school_crawl_logf($school_crawl_log, 4, "Unable to parse meeting time: `%s'. Skipping section/meeting (synonym=%s).", + $row[$fields['Time']], $synonym); /* ensure that the class is added nonetheless */ if ($semester->class_get($dept, $course_id) === NULL) + /** + * \todo + * SP does credit hours by section, what about Courses + * with no sections because they're these weird limbo + * `ARR' courses but these limbo courses still have a + * number of credit hours? + */ $semester->class_add(new Course($dept . '-' . $course_id, $row[$fields['Course Title']])); continue; } @@ -231,13 +267,22 @@ function umich_crawl_csv($school_crawl_l /* umich defines course_slots by meeting_type. */ $meeting_type = school_crawl_meeting_type(trim($row[$fields['Component']])); + /* + * Some information is only presented in the first row in a + * listing of courses. Perform some accumulation here. 
+ */ + foreach (array('Instructor') as $key) + if (strlen($curr_value = trim($row[$fields[$key]]))) + $row_accumulation[$key] = $curr_value; + $semester->section_meeting_add($dept, $course_id, trim($row[$fields['Course Title']]), trim($row[$fields['Section']]), $synonym, new SectionMeeting($days, $time_start, $time_end, trim($row[$fields['Location']]), $meeting_type, - trim($row[$fields['Instructor']])), - $meeting_type); + $row_accumulation['Instructor']), + $meeting_type, + $credit_hours); } }