diff --git a/school.d/umich.crawl.inc b/school.d/umich.crawl.inc new file mode 100644 --- /dev/null +++ b/school.d/umich.crawl.inc @@ -0,0 +1,85 @@ +. + */ + + +/** array_filter() callback: returns false when $item is empty after ltrim() (blank or whitespace-only), true otherwise. */ +function umich_arrayfilter_callback($item){ + if(ltrim($item) == ''){ + return false; + } + else{ + return true; + } +} + +/** Fetch the HTML at $url and parse the table at index 3 into an array of rows, each row an array of the non-blank td text values (original column keys kept). Rows with index 5 or lower are skipped, the last three collected rows are nulled out, and the final array_filter() drops those plus any row left empty (keys are not reindexed afterwards). Returns the integer 1, not an array, when file_get_contents() gives a falsy result. The original note says the first row holds the row headers. NOTE(review): item(3) is dereferenced without a null check, and the by-reference loop variable $item is never unset. */ +function umich_table_parse($url) { + $arr = array(); + $dom = new DOMDocument; + $html = file_get_contents($url); + if(!$html){ + return 1; + } + $dom->loadHTML($html); + $dom->preserveWhiteSpace = false; + $tables = $dom->getElementsByTagName('table'); + $rows = $tables->item(3)->getElementsByTagName('tr'); // item() is zero-based: this is the fourth table on the page, not the first + foreach ($rows as $rownum => $row) { + if($rownum > 5) { + $cols = $row->getElementsByTagName('td'); + foreach($cols as $colnum => $col){ + $arr[$rownum][$colnum] = $col->nodeValue; + } + } + } + foreach($arr as &$item) { + $item = array_filter($item, "umich_arrayfilter_callback"); + } + + $arr = array_values($arr); // Reindex array + + // Strip navigation and trailing garbage + $arr[count($arr)-3] = NULL; + $arr[count($arr)-2] = NULL; + $arr[count($arr)-1] = NULL; + + $arr = array_filter($arr); + return $arr; +} + +/** Crawls uMich course listings for $semester, issuing one umich_table_parse() request per department code. The season letter is the lowercased first character of $semester->season_get() (the original note gives "f" or "s"); the year is $semester->year_get() with its first two characters removed. Returns an array keyed by department code whose values are the umich_table_parse() results. NOTE(review): the strtolower($season) call made after the URL is built does not affect the URL. */ +function umich_crawl($semester) +{ + $year = substr($semester->year_get(), 2); + $season = strtolower(substr($semester->season_get(), 0, 1)); + + /* Current academic departments. Update as needed. 
*/ + $departments = array('AAPTIS','ACABS','AERO','AEROSP','AMCULT','ANTHRARC','ANTHRBIO','ANTHRCUL','AOSS','APPPHYS','ARCH','ARMENIAN','ARTDES','ASIAN','ASIANLAN','ASTRO','AUTO','BCS','BIOINF','BIOLCHEM','BIOLOGY','BIOMEDE','BIOPHYS','CAAS','CEE','CHE','CHEM','CIC','CICS','CJS','CLARCH','CLCIV','CMPLXSYS','COMM','COMP','COMPLIT','CSP','CZECH','DANCE','DUTCH','ECON','EDCURINS','EDUC','EEB','EECS','ELI','ENGLISH','ENGR','ENSCEN','ENVIRON','ESENG','FRENCH','GEOG','GEOSCI','GERMAN','GREEK','GTBOOKS','HBEHED','HISTART','HISTORY','HJCS','HMP','HONORS','INTMED','IOE','ITALIAN','JAZZ','JUDAIC','KINESLGY','LACS','LATIN','LHC','LHSP','LING','MACROMOL','MATH','MATSCIE','MCDB','MECHENG','MEDADM','MEDCHEM','MEMS','MENAS','MFG','MICROBIOL','MILSCI','MKT','MODGREEK','MOVESCI','MUSEUMS','MUSICOL','MUSMETH','MUSTHTRE','NAVARCH','NAVSCI','NERS','NEUROSCI','NRE','NURS','OMS','ORGSTUDY','PAT','PATH','PHARMACY','PHIL','PHRMACOL','PHYSICS','PHYSIOL','POLISH','POLSCI','PORTUG','PSYCH','PUBHLTH','PUBPOL','RCARTS','RCCORE','RCHUMS','RCIDIV','RCLANG','RCNSCI','RCSSCI','REEES','RELIGION','ROMLANG','ROMLING','RUSSIAN','SAC','SAS','SCAND','SEAS','SI','SLAVIC','SOC','SPANISH','STATS','STDABRD','SWC','TCHNCLCM','THEORY','THTREMUS','UC','UKRAINE','UP','WOMENSTD','YIDDISH'); + + $basepath = "http://www.lsa.umich.edu/cg/cg_results.aspx"; + $yearsyn = 1800 + $year; // Weird year synonym name where 2000 == 1800 + $basepath .= "?termArray={$season}_{$year}_${yearsyn}&cgtype=ug"; + $season = strtolower($season); + $tables = array(); + foreach($departments as $department) { + $tables[$department] = umich_table_parse($basepath . '&department=' . $department . 
'&allsections=true&show=1000'); + } + return $tables; +} diff --git a/school.d/umich.inc b/school.d/umich.inc --- a/school.d/umich.inc +++ b/school.d/umich.inc @@ -47,69 +47,3 @@ function umich_instructions_html() EOF; } - -/** Filter out whitepace items */ -function umich_arrayfilter_callback($item){ - if(ltrim($item) == ''){ - return false; - } - else{ - return true; - } -} - -/** Parse html at URL into array, first row is row headers */ -function umich_table_parse($url) { - $arr = array(); - $dom = new DOMDocument; - $html = file_get_contents($url); - if(!$html){ - return 1; - } - $dom->loadHTML($html); - $dom->preserveWhiteSpace = false; - $tables = $dom->getElementsByTagName('table'); - $rows = $tables->item(3)->getElementsByTagName('tr'); // Get first table on page - foreach ($rows as $rownum => $row) { - if($rownum > 5) { - $cols = $row->getElementsByTagName('td'); - foreach($cols as $colnum => $col){ - $arr[$rownum][$colnum] = $col->nodeValue; - } - } - } - foreach($arr as &$item) { - $item = array_filter($item, "umich_arrayfilter_callback"); - } - - $arr = array_values($arr); // Reindex array - - // Strip navigation and trailing garbage - $arr[count($arr)-3] = NULL; - $arr[count($arr)-2] = NULL; - $arr[count($arr)-1] = NULL; - - $arr = array_filter($arr); - return $arr; -} - -/** Crawls uMich course listings. $season is "f" or "s", year is 2-digit year */ -function umich_crawl($semester) -{ - $year = substr($semester->year_get(), 2); - $season = strtolower(substr($semester->season_get(), 0, 1)); - - /* Current academic departments. Update as needed. 
*/ - $departments = array('AAPTIS','ACABS','AERO','AEROSP','AMCULT','ANTHRARC','ANTHRBIO','ANTHRCUL','AOSS','APPPHYS','ARCH','ARMENIAN','ARTDES','ASIAN','ASIANLAN','ASTRO','AUTO','BCS','BIOINF','BIOLCHEM','BIOLOGY','BIOMEDE','BIOPHYS','CAAS','CEE','CHE','CHEM','CIC','CICS','CJS','CLARCH','CLCIV','CMPLXSYS','COMM','COMP','COMPLIT','CSP','CZECH','DANCE','DUTCH','ECON','EDCURINS','EDUC','EEB','EECS','ELI','ENGLISH','ENGR','ENSCEN','ENVIRON','ESENG','FRENCH','GEOG','GEOSCI','GERMAN','GREEK','GTBOOKS','HBEHED','HISTART','HISTORY','HJCS','HMP','HONORS','INTMED','IOE','ITALIAN','JAZZ','JUDAIC','KINESLGY','LACS','LATIN','LHC','LHSP','LING','MACROMOL','MATH','MATSCIE','MCDB','MECHENG','MEDADM','MEDCHEM','MEMS','MENAS','MFG','MICROBIOL','MILSCI','MKT','MODGREEK','MOVESCI','MUSEUMS','MUSICOL','MUSMETH','MUSTHTRE','NAVARCH','NAVSCI','NERS','NEUROSCI','NRE','NURS','OMS','ORGSTUDY','PAT','PATH','PHARMACY','PHIL','PHRMACOL','PHYSICS','PHYSIOL','POLISH','POLSCI','PORTUG','PSYCH','PUBHLTH','PUBPOL','RCARTS','RCCORE','RCHUMS','RCIDIV','RCLANG','RCNSCI','RCSSCI','REEES','RELIGION','ROMLANG','ROMLING','RUSSIAN','SAC','SAS','SCAND','SEAS','SI','SLAVIC','SOC','SPANISH','STATS','STDABRD','SWC','TCHNCLCM','THEORY','THTREMUS','UC','UKRAINE','UP','WOMENSTD','YIDDISH'); - - $basepath = "http://www.lsa.umich.edu/cg/cg_results.aspx"; - $yearsyn = 1800 + $year; // Weird year synonym name where 2000 == 1800 - $basepath .= "?termArray={$season}_{$year}_${yearsyn}&cgtype=ug"; - $season = strtolower($season); - $tables = array(); - foreach($departments as $department) { - $tables[$department] = umich_table_parse($basepath . '&department=' . $department . '&allsections=true&show=1000'); - } - return $tables; -} -