@techreport{singhal95length, author = {Amit Singhal and Gerard Salton and Chris Buckley}, title = {Length Normalization in Degraded Text Collections}, year = {1995}, institution = {Cornell University}, address = {Ithaca, NY, USA}, } @inproceedings{bast05spectral, author = {Holger Bast and Debapriyo Majumdar}, title = {Why spectral retrieval works}, booktitle = {SIGIR '05: Proceedings of the 28th annual international ACM SIGIR conference on Research and development in information retrieval}, year = {2005}, isbn = {1-59593-034-5}, pages = {11--18}, location = {Salvador, Brazil}, doi = {http://doi.acm.org/10.1145/1076034.1076040}, publisher = {ACM Press}, address = {New York, NY, USA}, } @article{moffat95insitu, author = {Alistair Moffat and Timothy A. H. Bell}, title = {In situ generation of compressed inverted files}, journal = {J. Am. Soc. Inf. Sci.}, volume = {46}, number = {7}, year = {1995}, issn = {0002-8231}, pages = {537--550}, publisher = {John Wiley \& Sons, Inc.}, address = {New York, NY, USA}, } @inproceedings{zukowski06superscalar, author = {Marcin Zukowski and Sandor Heman and Niels Nes and Peter Boncz}, title = {Super-Scalar RAM-CPU Cache Compression}, booktitle = {ICDE '06: Proceedings of the 22nd International Conference on Data Engineering (ICDE'06)}, year = {2006}, isbn = {0-7695-2570-9}, pages = {59}, doi = {http://dx.doi.org/10.1109/ICDE.2006.150}, publisher = {IEEE Computer Society}, address = {Washington, DC, USA}, } @inproceedings{dom02information, author = {Byron E. 
Dom}, title = {An Information-Theoretic External Cluster-Validity Measure}, booktitle = {Proceedings of the 18th Conference on Uncertainty in Artificial Intelligence (UAI-2002)}, month = {August}, year = {2002} } @inproceedings{blanco07boosting, title = {Boosting Static Pruning of Inverted Files}, author = {Roi Blanco and Alvaro Barreiro}, year = 2007, booktitle = {SIGIR} } @article{weiss05concept, author = "Stanis{\l}aw Osi{\'n}ski and Dawid Weiss", title = "{A Concept-Driven Algorithm for Clustering Search Results}", journal = "IEEE Intelligent Systems", number = "3", volume = "20", pages = "48--54", year = "2005", } @inproceedings{arthur06worstcase, author = "David Arthur and Sergei Vassilvitskii", title = "{On the Worst Case Complexity of the k-Means Method}", booktitle = "Proceedings of the 22nd Annual ACM Symposium on Computational Geometry, Sedona, Arizona", year = "2006", note = "(awaiting publication)", } @book{fellbaum98wordnet, title = {WordNet -- An Electronic Lexical Database}, author = {Christiane D. Fellbaum}, publisher = {MIT Press}, year = {1998} } @article{fowlkes83clusterings, AUTHOR = "Edward B. Fowlkes and Colin L. Mallows", TITLE = "A Method for Comparing Two Hierarchical Clusterings", JOURNAL = "ADAJ", VOLUME = "78", YEAR = "1983", NUMBER = "383", PAGES = "553-569" } @inproceedings {kleinberg02impossibility, title = {An Impossibility Theorem for Clustering}, author = {Jon M. Kleinberg}, year = {2002}, booktitle = {NIPS} } @InProceedings{meila05clusterings, author = "Marina Meil\u{a}", title = "Comparing clusterings -- {A}n axiomatic view", booktitle = "Proceedings of the 22nd International Conference on Machine Learning", year = "2005", address = "Bonn, Germany", pages = "", } @article{savaresi04pddp, author = {Sergio M. Savaresi and Daniel Boley}, title = {A comparative analysis on the bisecting K-means and the PDDP clustering algorithms.}, journal = {Intell. 
Data Anal.}, volume = {8}, number = {4}, year = {2004}, pages = {345-362} } @article{castro04likelihood, author = {R. M. Castro and M. J. Coates and R. D. Nowak}, title = {Likelihood Based Hierarchical Clustering}, journal = {IEEE Transactions in Signal Processing}, volume=52, number=8, year = {2004}, pages = {2308--2321} } @inproceedings{kamvar02interpreting, author = {Sepandar D. Kamvar and Dan Klein and Christopher D. Manning}, title = {Interpreting and Extending Classical Agglomerative Clustering Algorithms using a Model-Based approach}, booktitle = {ICML '02: Proceedings of the Nineteenth International Conference on Machine Learning}, year = {2002}, isbn = {1-55860-873-7}, pages = {283--290}, publisher = {Morgan Kaufmann Publishers Inc.}, address = {San Francisco, CA, USA}, } @book{mclachlan96em, title = {The {EM} Algorithm and Extensions}, author = {Geoffrey J. McLachlan and Thiriyambakam Krishnan}, year = 1996, publisher = {John Wiley \& Sons} } @article{blei03latent, author = {David M. Blei and Andrew Y. Ng and Michael I. Jordan}, title = {Latent dirichlet allocation}, journal = {J. Mach. Learn. Res.}, volume = {3}, year = {2003}, issn = {1533-7928}, pages = {993--1022}, publisher = {MIT Press}, address = {Cambridge, MA, USA}, } @BOOK{rice06statistics, AUTHOR = {John A. Rice}, TITLE = {Mathematical Statistics and Data Analysis}, PUBLISHER = {Duxbury Press}, YEAR = 2006 } @BOOK{sheldon06probability, AUTHOR = {Sheldon Ross}, TITLE = {A First Course in Probability}, PUBLISHER = {Pearson Prentice Hall}, YEAR = 2006 } @inproceedings{buttcher06document, author = {Stefan B\"{u}ttcher and Charles L. A. 
Clarke}, title = {A document-centric approach to static index pruning in text retrieval systems}, booktitle = {CIKM '06: Proceedings of the 15th ACM international conference on Information and knowledge management}, year = {2006}, isbn = {1-59593-433-2}, pages = {182--189}, location = {Arlington, Virginia, USA}, doi = {http://doi.acm.org/10.1145/1183614.1183644}, publisher = {ACM Press}, address = {New York, NY, USA}, } @article{trotman03compressing, author = {Andrew Trotman}, title = {Compressing Inverted Files}, journal = {Inf. Retr.}, volume = {6}, number = {1}, year = {2003}, issn = {1386-4564}, pages = {5--19}, doi = {http://dx.doi.org/10.1023/A:1022949613039}, publisher = {Kluwer Academic Publishers}, address = {Hingham, MA, USA}, } @book{cover91elements, author = {Thomas M. Cover and Joy A. Thomas}, title = {Elements of Information Theory}, publisher = {Wiley}, year = {1991}, address = {New York} } @article{barroso03web, author = {Luiz Andr\'{e} Barroso and Jeffrey Dean and Urs H\"{o}lzle}, title = {Web Search for a Planet: The {G}oogle Cluster Architecture}, journal = {IEEE Micro}, volume = {23}, number = {2}, year = {2003}, issn = {0272-1732}, pages = {22--28}, doi = {http://dx.doi.org/10.1109/MM.2003.1196112}, publisher = {IEEE Computer Society Press}, address = {Los Alamitos, CA, USA}, } @book{comtet74advanced, author = {Louis Comtet}, publisher = {Reidel}, title = {Advanced Combinatorics}, year = {1974} } @inproceedings{ball65data, author = {G. H. Ball}, title = {Data analysis in the social sciences: {W}hat about the details?}, booktitle = {Proceedings of the Fall Joint Computer Conference}, publisher = {Spartan Books}, pages = {533--560}, year = 1965 } @book{burnham02model, author = {Kenneth P. Burnham and David Anderson }, citeulike-article-id = {157697}, isbn = {0387953647}, publisher = {Springer}, title = {Model Selection and Multi-Model Inference}, year = {2002} } @article{hartigan79kmeans, author={J. A. Hartigan and M. A. 
Wong}, title={A {K}-Means Clustering Algorithm}, journal={Applied Statistics}, volume=28, pages={100--108}, entrydate=20030618, key={Hartigan/Wong:79}, year=1979, } % the original only has initials, no full first names @inproceedings{basu04active, title = {Active Semi-Supervision for Pairwise Constrained Clustering}, address = {Lake Buena Vista, FL}, author = {Sugato Basu and Arindam Banerjee and Raymond J. Mooney}, booktitle = {Proceedings of the SIAM International Conference on Data Mining}, pages = {333--344}, year = {2004} } @article{godoy06modeling, author = {Daniela Godoy and Anal\'{i}a Amandi}, title = {Modeling user interests by conceptual clustering}, journal = {Inf. Syst.}, volume = {31}, number = {4}, year = {2006}, issn = {0306-4379}, pages = {247--265}, doi = {http://dx.doi.org/10.1016/j.is.2005.02.008}, publisher = {Elsevier Science Ltd.}, address = {Oxford, UK, UK}, } @inproceedings{huang06text, author = {Yifen Huang and Tom M. Mitchell}, title = {Text clustering with extended user feedback}, booktitle = {SIGIR '06: Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval}, year = {2006}, isbn = {1-59593-369-7}, pages = {413--420}, location = {Seattle, Washington, USA}, doi = {http://doi.acm.org/10.1145/1148170.1148242}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{weiss03web, editor = "Mieczys{\l{}}aw A. K{\l{}}opotek and S{\l{}}awomir T. Wierzcho{\'{n}} and Krzysztof Trojanowski", author = "Dawid Weiss and Jerzy Stefanowski", title = "Web Search Results Clustering in {P}olish: {E}xperimental evaluation of {C}arrot", booktitle = {New Trends in Intelligent Information Processing and Web Mining Conference}, year = "2003" } @techreport{dom01informationtheoretic, author = {Byron E. 
Dom}, title = {An Information-Theoretic External Cluster-Validity Measure}, booktitle = {UAI}, year = {2001}, institution = {IBM}, number = {RJ 10219} } @inproceedings{dom02informationtheoretic, author = {Byron Dom}, title = {An Information-Theoretic External Cluster-Validity Measure.}, booktitle = {UAI}, year = {2002}, pages = {137-145}, crossref = {DBLP:conf/uai/2002}, bibsource = {DBLP, http://dblp.uni-trier.de} } @inproceedings{crouch88cluster, author = {Carolyn J. Crouch}, title = {A cluster-based approach to thesaurus construction}, booktitle = {SIGIR '88: Proceedings of the 11th annual international ACM SIGIR conference on Research and development in information retrieval}, year = {1988}, isbn = {2-7061-0309-4}, pages = {309--320}, location = {Grenoble, France}, doi = {http://doi.acm.org/10.1145/62437.62467}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{schuetze95information, author={Hinrich Sch{\"u}tze and Jan O. Pedersen}, title={Information Retrieval Based on Word Senses}, year = 1995, booktitle={Fourth Annual Symposium on Document Analysis and Information Retrieval}, address={Las Vegas, NV}, pages = {161--175} } @book{witten05data, title = {Data Mining: Practical Machine Learning Tools and Techniques}, author = {Ian H. 
Witten and Eibe Frank}, edition = {Second}, howpublished = {Paperback}, month = {June}, publisher = {Morgan Kaufmann}, series = {Morgan Kaufmann Series in Data Management Sys}, year = {2005}, isbn = {0120884070}, citeulike-article-id = {340715}, priority = {0}, keywords = {weka data mining da } } @incollection{cheeseman96bayesian, author = {Peter Cheeseman and John Stutz}, title = {Bayesian Classification (AutoClass): Theory and Results.}, booktitle = {Advances in Knowledge Discovery and Data Mining}, year = {1996}, pages = {153-180}, publisher = {MIT Press}, bibsource = {DBLP, http://dblp.uni-trier.de} } @unpublished{mccallum96bow, author = "Andrew Kachites McCallum", title = {Bow: A toolkit for statistical language modeling, text retrieval, classification and clustering}, note = "\url{http://www.cs.cmu.edu/~mccallum/bow}", year = 1996} @inproceedings{picca06nonlinear, author = {Davide Picca and Beno\^{i}t Curdy and Fran\c{c}ois Bavaud}, title = {Non-linear correspondence analysis in text retrieval: a kernel view}, booktitle = {Proceedings of JADT}, year = {2006} } @article{cambazoglu06performance, author = {{Berkant Barla} Cambazoglu and Cevdet Aykanat}, title = {Performance of query processing implementations in ranking-based text retrieval systems using inverted indices.}, journal = {Inf. Process. Manage.}, volume = {42}, number = {4}, year = {2006}, pages = {875-898}, ee = {http://dx.doi.org/10.1016/j.ipm.2005.06.004}, bibsource = {DBLP, http://dblp.uni-trier.de} } @book{bishop06pattern, author = {Christopher M. 
Bishop}, title = { Pattern Recognition and Machine Learning}, publisher = {Springer}, year = 2006 } @inproceedings{ghamrawi05collective, author = {Nadia Ghamrawi and Andrew McCallum}, title = {Collective multi-label classification}, booktitle = {CIKM '05: Proceedings of the 14th ACM international conference on Information and knowledge management}, year = {2005}, isbn = {1-59593-140-6}, pages = {195--200}, location = {Bremen, Germany}, doi = {http://doi.acm.org/10.1145/1099554.1099591}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{crammer01pranking, author = {Koby Crammer and Yoram Singer}, title = {Pranking with Ranking.}, booktitle = {NIPS}, year = {2001}, pages = {641-647}, ee = {http://www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/AA65.ps.gz}, bibsource = {DBLP, http://dblp.uni-trier.de} } @article{geman92neural, author = {Stuart Geman and Elie Bienenstock and Ren{\'e} Doursat}, title = {Neural networks and the bias/variance dilemma}, journal = {Neural Comput.}, volume = {4}, number = {1}, year = {1992}, issn = {0899-7667}, pages = {1--58}, publisher = {MIT Press}, address = {Cambridge, MA, USA}, } @inproceedings{bennett99densitybased, author = {Kristin P. 
Bennett and Usama Fayyad and Dan Geiger}, title = {Density-based indexing for approximate nearest-neighbor queries}, booktitle = {KDD '99: Proceedings of the fifth ACM SIGKDD international conference on Knowledge discovery and data mining}, year = {1999}, isbn = {1-58113-143-7}, pages = {233--243}, location = {San Diego, California, United States}, doi = {http://doi.acm.org/10.1145/312129.312236}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{fang04formal, author = {Hui Fang and Tao Tao and ChengXiang Zhai}, title = {A formal study of information retrieval heuristics}, booktitle = {SIGIR '04: Proceedings of the 27th annual international ACM SIGIR conference on Research and development in information retrieval}, year = {2004}, isbn = {1-58113-881-4}, pages = {49--56}, location = {Sheffield, United Kingdom}, doi = {http://doi.acm.org/10.1145/1008992.1009004}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{buckley85optimization, author = {Chris Buckley and Alan F. Lewit}, title = {Optimization of inverted vector searches}, booktitle = {SIGIR '85: Proceedings of the 8th annual international ACM SIGIR conference on Research and development in information retrieval}, year = {1985}, isbn = {0-89791-159-8}, pages = {97--110}, location = {Montreal, Quebec, Canada}, doi = {http://doi.acm.org/10.1145/253495.253515}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{anagnostopoulos06effective, author = {Aris Anagnostopoulos and Andrei Z. 
Broder and Kunal Punera}, title = {Effective and efficient classification on a search-engine model}, booktitle = {CIKM '06: Proceedings of the 15th ACM international conference on Information and knowledge management}, year = {2006}, isbn = {1-59593-433-2}, pages = {208--217}, location = {Arlington, Virginia, USA}, doi = {http://doi.acm.org/10.1145/1183614.1183648}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{guttman84rtrees, author = {Antonin Guttman}, title = {R-trees: a dynamic index structure for spatial searching}, booktitle = {SIGMOD '84: Proceedings of the 1984 ACM SIGMOD international conference on Management of data}, year = {1984}, isbn = {0-89791-128-8}, pages = {47--57}, location = {Boston, Massachusetts}, doi = {http://doi.acm.org/10.1145/602259.602266}, publisher = {ACM Press}, address = {New York, NY, USA}, } @article{ rahm01survey, author = "Erhard Rahm and Philip A. Bernstein", title = "A survey of approaches to automatic schema matching", journal = "VLDB Journal: Very Large Data Bases", volume = "10", number = "4", month = "????", pages = "334--350", year = "2001", url = "citeseer.ist.psu.edu/rahm01survey.html" } @inproceedings{hatzivassiloglou00linguistic, author = {Vasileios Hatzivassiloglou and Luis Gravano and Ankineedu Maganti}, title = {An investigation of linguistic features and clustering algorithms for topical document clustering}, booktitle = {SIGIR '00: Proceedings of the 23rd annual international ACM SIGIR conference on Research and development in information retrieval}, year = {2000}, isbn = {1-58113-226-3}, pages = {224--231}, location = {Athens, Greece}, doi = {http://doi.acm.org/10.1145/345508.345582}, publisher = {ACM Press}, address = {New York, NY, USA}, } @Article{lance67general, author = "G. N. Lance and W. T. Williams", title = "A general theory of classificatory sorting strategies 1. 
{Hierarchical} systems", journal = "Computer Journal", volume = "9", number = "4", pages = "373--380", month = feb, year = "1967", CODEN = "CMPJA6", ISSN = "0010-4620" } @article{zhao04empirical, author = {Ying Zhao and George Karypis}, title = {Empirical and Theoretical Comparisons of Selected Criterion Functions for Document Clustering}, journal = {Mach. Learn.}, volume = {55}, number = {3}, year = {2004}, issn = {0885-6125}, pages = {311--331}, doi = {http://dx.doi.org/10.1023/B:MACH.0000027785.44527.d6}, publisher = {Kluwer Academic Publishers}, address = {Hingham, MA, USA}, } @inproceedings{sahoo06incremental, author = {Nachiketa Sahoo and Jamie Callan and Ramayya Krishnan and George Duncan and Rema Padman}, title = {Incremental hierarchical clustering of text documents}, booktitle = {CIKM '06: Proceedings of the 15th ACM international conference on Information and knowledge management}, year = {2006}, isbn = {1-59593-433-2}, pages = {357--366}, location = {Arlington, Virginia, USA}, doi = {http://doi.acm.org/10.1145/1183614.1183667}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{larsen99fast, author = {Bjornar Larsen and Chinatsu Aone}, title = {Fast and effective text mining using linear-time document clustering}, booktitle = {KDD '99: Proceedings of the fifth ACM SIGKDD international conference on Knowledge discovery and data mining}, year = {1999}, isbn = {1-58113-143-7}, pages = {16--22}, location = {San Diego, California, United States}, doi = {http://doi.acm.org/10.1145/312129.312186}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{zhao02evaluation, author = {Ying Zhao and George Karypis}, title = {Evaluation of hierarchical clustering algorithms for document datasets}, booktitle = {CIKM '02: Proceedings of the eleventh international conference on Information and knowledge management}, year = {2002}, isbn = {1-58113-492-4}, pages = {515--524}, location = {McLean, Virginia, USA}, doi = 
{http://doi.acm.org/10.1145/584792.584877}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{buttcher05indexing, author = {Stefan B\"{u}ttcher and Charles L. A. Clarke}, title = {Indexing time vs. query time: trade-offs in dynamic information retrieval systems}, booktitle = {CIKM '05: Proceedings of the 14th ACM international conference on Information and knowledge management}, year = {2005}, isbn = {1-59593-140-6}, pages = {317--318}, location = {Bremen, Germany}, doi = {http://doi.acm.org/10.1145/1099554.1099645}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{forman06tackling, author = {George Forman}, title = {Tackling concept drift by temporal inductive transfer}, booktitle = {SIGIR '06: Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval}, year = {2006}, isbn = {1-59593-369-7}, pages = {252--259}, location = {Seattle, Washington, USA}, doi = {http://doi.acm.org/10.1145/1148170.1148216}, publisher = {ACM Press}, address = {New York, NY, USA}, } @ARTICLE {brisaboa06lightweight, TITLE = "Lightweight Natural Language Text Compression", AUTHOR = "Nieves R. Brisaboa and Antonio Fari\~{n}a and Gonzalo Navarro and Jos\'{e} R. Param\'{a}", JOURNAL = {Information Retrieval}, YEAR = 2007, volume = 10, number = 1, pages = {1--33} } % http://www.springerlink.com/content/f2019344p476r022/ @inproceedings{buttcher05security, author = {Stefan B{\"u}ttcher and Charles L. A. 
Clarke}, title = {A Security Model for Full-Text File System Search in Multi-User Environments.}, booktitle = {FAST}, year = {2005}, ee = {http://www.usenix.org/events/fast05/tech/buettcher.html} } @book{heaps78information, author = {Heaps, Harold S.}, year = {1978}, title = {Information Retrieval: Computational and Theoretical Aspects}, publisher = {Academic Press}, address = {New York} } @article{anh06improved, author = {Vo Ngoc Anh and Alistair Moffat}, title = "Improved Word-Aligned Binary Compression for Text Indexing", journal = "IEEE Transactions on Knowledge and Data Engineering", year = 2006, volume = 18, number = 6, pages = "857-861" } @InProceedings{buckley94automatic, Author = "Chris Buckley and James Allan and Gerard Salton", Title = "Automatic Routing and Ad-hoc Retrieval using SMART: TREC 2", Booktitle = "Proc.\ of the 2nd Text Retrieval Conference (TREC-2)", Pages = "45--55", Year = 1994 } @inproceedings{schapire98boosting, author={Robert E. Schapire and Yoram Singer and Amit Singhal}, title={Boosting and {R}occhio Applied to Text Filtering}, year=1998, pages = {215--223}, booktitle={SIGIR '98}, } @inproceedings{hoch94using, author = {Rainer Hoch}, title = {Using IR techniques for text classification in document analysis}, booktitle = {SIGIR '94: Proceedings of the 17th annual international ACM SIGIR conference on Research and development in information retrieval}, year = {1994}, isbn = {0-387-19889-X}, pages = {31--40}, location = {Dublin, Ireland}, publisher = {Springer-Verlag New York, Inc.}, address = {New York, NY, USA}, } @article{fuernkranz02round, author = {Johannes F\"{u}rnkranz}, title = {Round robin classification}, journal = {J. Mach. Learn. Res.}, volume = {2}, year = {2002}, issn = {1533-7928}, pages = {721--747}, publisher = {MIT Press}, address = {Cambridge, MA, USA}, } @inProceedings{ittner95text, author = {David J. Ittner and David D. Lewis and David D. 
Ahn}, title = {Text categorization of low quality images}, booktitle = {Proceedings of SDAIR-95, 4th Annual Symposium on Document Analysis and Information Retrieval}, publisher = {}, editor = {}, year = {1995}, address = {Las Vegas, US}, pages = {301--315} } @article{bharat98personalized, author = {Krishna Bharat and Tomonari Kamba and Michael Albers}, title = {Personalized, Interactive News on the Web}, journal = {Multimedia Syst.}, volume = {6}, number = {5}, year = {1998}, pages = {349-358}, ee = {http://link.springer.de/link/service/journals/00530/bibs/8006005/80060349.htm} } @inproceedings{alonso06gio, author = {Omar Alonso and Sandeepan Banerjee and Mark Drake}, title = {GIO: a semantic web application using the information grid framework}, booktitle = {WWW '06: Proceedings of the 15th international conference on World Wide Web}, year = {2006}, isbn = {1-59593-323-9}, pages = {857--858}, location = {Edinburgh, Scotland}, doi = {http://doi.acm.org/10.1145/1135777.1135913}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{toda06search, author = {Hiroyuki Toda and Ryoji Kataoka}, title = {A search result clustering method using informatively named entities}, booktitle = {WIDM '05: Proceedings of the 7th annual ACM international workshop on Web information and data management}, year = {2005}, isbn = {1-59593-194-5}, pages = {81--86}, location = {Bremen, Germany}, doi = {http://doi.acm.org/10.1145/1097047.1097063}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{ogilvie05parameter, author = {Paul Ogilvie and Jamie Callan}, title = {Parameter Estimation for a Simple Hierarchical Generative Model for XML Retrieval.}, booktitle = {INEX}, year = {2005}, pages = {211-224}, ee = {http://dx.doi.org/10.1007/11766278_16}, bibsource = {DBLP, http://dblp.uni-trier.de} } @article{raghavan89critical, author = {Vijay Raghavan and Peter Bollmann and Gwang S. 
Jung}, title = {A critical investigation of recall and precision as measures of retrieval system performance}, journal = {ACM Trans. Inf. Syst.}, volume = {7}, number = {3}, year = {1989}, issn = {1046-8188}, pages = {205--229}, doi = {http://doi.acm.org/10.1145/65943.65945}, publisher = {ACM Press}, address = {New York, NY, USA}, } @article{witten90source, author = {Ian H. Witten and T. C. Bell}, title = {Source models for natural language text}, journal = {Int. J. Man-Mach. Stud.}, volume = {32}, number = {5}, year = {1990}, issn = {0020-7373}, pages = {545--579}, publisher = {Academic Press Ltd.}, address = {London, UK, UK}, } @inproceedings{kleinberg97two, author = {Jon M. Kleinberg}, title = {Two algorithms for nearest-neighbor search in high dimensions}, booktitle = {STOC '97: Proceedings of the twenty-ninth annual ACM symposium on Theory of computing}, year = {1997}, isbn = {0-89791-888-6}, pages = {599--608}, location = {El Paso, Texas, United States}, doi = {http://doi.acm.org/10.1145/258533.258653}, publisher = {ACM Press}, address = {New York, NY, USA}, } @incollection{anh06structured, title = {Structured Index Organizations for High-Throughput Text Querying}, booktitle = {Proc. 13th Int. Symp. String Processing and Information Retrieval}, series = {Lecture Notes in Computer Science}, author = {Vo Ngoc Anh and Alistair Moffat}, publisher = {Springer}, pages={304-315}, year =2006, volume =4209 } @inproceedings{koenemann96interaction, author = {J\"{u}rgen Koenemann and Nicholas J. 
Belkin}, title = {A case for interaction: a study of interactive information retrieval behavior and effectiveness}, booktitle = {CHI '96: Proceedings of the SIGCHI conference on Human factors in computing systems}, year = {1996}, isbn = {0-89791-777-4}, pages = {205--212}, location = {Vancouver, British Columbia, Canada}, doi = {http://doi.acm.org/10.1145/238386.238487}, publisher = {ACM Press}, address = {New York, NY, USA}, } @article{dieugenio04kappa, author = {Barbara {Di Eugenio} and Michael Glass}, title = {The Kappa Statistic: A Second Look.}, journal = {Computational Linguistics}, volume = {30}, number = {1}, year = {2004}, pages = {95-101}, ee = {http://dx.doi.org/10.1162/089120104773633402}, bibsource = {DBLP, http://dblp.uni-trier.de} } @inproceedings{singitham04efficiency, author = {Pavan Kumar C. Singitham and Mahathi S. Mahabhashyam and Prabhakar Raghavan}, title = {Efficiency-Quality Tradeoffs for Vector Score Aggregation.}, booktitle = {VLDB}, year = {2004}, pages = {624-635}, ee = {http://www.vldb.org/conf/2004/RS17P1.PDF}, bibsource = {DBLP, http://dblp.uni-trier.de} } @inproceedings{buttcher06hybrid, author = {Stefan B\"{u}ttcher and Charles L. A. Clarke and Brad Lushman}, title = {Hybrid index maintenance for growing text collections}, booktitle = {SIGIR '06: Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval}, year = {2006}, isbn = {1-59593-369-7}, pages = {356--363}, location = {Seattle, Washington, USA}, doi = {http://doi.acm.org/10.1145/1148170.1148233}, publisher = {ACM Press}, address = {New York, NY, USA}, } @article{heinz02burst, author = {Steffen Heinz and Justin Zobel and Hugh E. Williams}, title = {Burst tries: a fast, efficient data structure for string keys}, journal = {ACM Trans. Inf. 
Syst.}, volume = {20}, number = {2}, year = {2002}, issn = {1046-8188}, pages = {192--223}, doi = {http://doi.acm.org/10.1145/506309.506312}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{ribeiro99efficient, author = {Berthier Ribeiro-Neto and Edleno S. Moura and Marden S. Neubert and Nivio Ziviani}, title = {Efficient distributed algorithms to build inverted files}, booktitle = {SIGIR '99: Proceedings of the 22nd annual international ACM SIGIR conference on Research and development in information retrieval}, year = {1999}, isbn = {1-58113-096-1}, pages = {105--112}, location = {Berkeley, California, United States}, doi = {http://doi.acm.org/10.1145/312624.312663}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{melnik01building, author = {Sergey Melnik and Sriram Raghavan and Beverly Yang and Hector Garcia-Molina}, title = {Building a distributed full-text index for the Web}, booktitle = {WWW '01: Proceedings of the 10th international conference on World Wide Web}, year = {2001}, isbn = {1-58113-348-0}, pages = {396--406}, location = {Hong Kong, Hong Kong}, doi = {http://doi.acm.org/10.1145/371920.372095}, publisher = {ACM Press}, address = {New York, NY, USA}, } @article{lester06efficient, author = {Nicholas Lester and Justin Zobel and Hugh E. Williams}, title = {Efficient online index maintenance for contiguous inverted lists.}, journal = {Information Processing \& Management}, volume = {42}, number = {4}, year = {2006}, pages = {916--933}, ee = {http://dx.doi.org/10.1016/j.ipm.2005.09.005}, bibsource = {DBLP, http://dblp.uni-trier.de} } @article{williams05searchable, author = {Hugh E. Williams and Justin Zobel}, title = {Searchable words on the Web}, journal = {Int. J. 
on Digital Libraries}, volume = {5}, number = {2}, year = {2005}, pages = {99-105}, ee = {http://dx.doi.org/10.1007/s00799-003-0050-z}, bibsource = {DBLP, http://dblp.uni-trier.de} } @article{heinz03efficient, author = {Steffen Heinz and Justin Zobel}, title = {Efficient single-pass index construction for text databases}, journal = {J. Am. Soc. Inf. Sci. Technol.}, volume = {54}, number = {8}, year = {2003}, issn = {1532-2882}, pages = {713--729}, doi = {http://dx.doi.org/10.1002/asi.10268}, publisher = {John Wiley \& Sons, Inc.}, address = {New York, NY, USA}, } @article{zobel01inmemory, author = {Justin Zobel and Steffen Heinz and Hugh E. Williams}, title = {In-memory hash tables for accumulating text vocabularies}, journal = {Inf. Process. Lett.}, volume = {80}, number = {6}, year = {2001}, issn = {0020-0190}, pages = {271--277}, doi = {http://dx.doi.org/10.1016/S0020-0190(01)00239-3}, publisher = {Elsevier North-Holland, Inc.}, address = {Amsterdam, The Netherlands, The Netherlands}, } @inproceedings{lester05fast, author = {Nicholas Lester and Alistair Moffat and Justin Zobel}, title = {Fast on-line index construction by geometric partitioning}, booktitle = {CIKM '05: Proceedings of the 14th ACM international conference on Information and knowledge management}, year = {2005}, isbn = {1-59593-140-6}, pages = {776--783}, location = {Bremen, Germany}, doi = {http://doi.acm.org/10.1145/1099554.1099739}, publisher = {ACM Press}, address = {New York, NY, USA}, } @inproceedings{treeratpituk06experimental, author = {Pucktada Treeratpituk and Jamie Callan}, title = {An experimental study on automatically labeling hierarchical clusters using statistical features}, booktitle = {SIGIR '06: Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval}, year = {2006}, isbn = {1-59593-369-7}, pages = {707--708}, location = {Seattle, Washington, USA}, doi = {http://doi.acm.org/10.1145/1148170.1148328}, publisher = {ACM 
Press}, address = {New York, NY, USA}, } @Manual{r05r, title = {R: A language and environment for statistical computing}, author = {{R Development Core Team}}, organization = {R Foundation for Statistical Computing}, address = {Vienna, Austria}, year = {2005}, note = {{ISBN} 3-900051-07-0}, url = {http://www.R-project.org}, } @article{tombros02effectiveness, author = {Anastasios Tombros and Robert Villa and C. J. Van Rijsbergen}, title = {The effectiveness of query-specific hierarchic clustering in information retrieval}, journal = {Inf. Process. Manage.}, volume = {38}, number = {4}, year = {2002}, issn = {0306-4573}, pages = {559--582}, doi = {http://dx.doi.org/10.1016/S0306-4573(01)00048-6}, publisher = {Pergamon Press, Inc.}, address = {Tarrytown, NY, USA}, } @article{schwarz78estimating, author = "Gideon Schwarz", title = "Estimating the dimension of a model", journal = "Annals of Statistics", year = {1978}, volume = {6}, number =2, pages = {461--464} } @inproceedings{pelleg00xmeans, Year = {2000}, Pages = {727-734}, Publisher = {Morgan Kaufmann}, Address = {San Francisco}, Booktitle = {Proceedings of the Seventeenth International Conference on Machine Learning}, Author = {Dan Pelleg and Andrew Moore}, Title = {X-means: Extending K-means with Efficient Estimation of the Number of Clusters} } @article{akaike74new, author = "Hirotugu Akaike", title = "A new look at the statistical model identification", journal = "{IEEE} Transactions on automatic control", year = {1974}, volume = {19}, number=6, pages = {716-723} } @article{tibshirani01estimating, author = { Robert Tibshirani and Guenther Walther and Trevor Hastie}, title = {Estimating the number of clusters in a data set via the gap statistic}, journal = {J. Roy. Statist. Soc. Ser. B}, volume = 63, year =2001, pages = { 411--423} } @inproceedings{bradley98scaling, author = {Paul S. Bradley and Usama M. 
Fayyad and Cory Reina},
  title = {Scaling Clustering Algorithms to Large Databases},
  booktitle = {KDD},
  year = {1998},
  pages = {9--15},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}

@inproceedings{fayyad98initialization,
  author = {Usama M. Fayyad and Cory Reina and Paul S. Bradley},
  title = {Initialization of Iterative Refinement Clustering Algorithms},
  booktitle = {KDD},
  year = {1998},
  pages = {194--198},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

% Symposium contribution: the proceedings name belongs in booktitle, so the
% entry is typed as a proceedings paper rather than a journal article.
@inproceedings{macqueen67some,
  author = {J.~MacQueen},
  title = {Some methods for classification and analysis of multivariate observations},
  booktitle = {Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability},
  volume = {1},
  pages = {281--297},
  year = {1967},
  publisher = {University of California Press},
}

@article{lloyd82least,
  author = {Stuart P. Lloyd},
  title = {Least squares quantization in {PCM}},
  journal = {IEEE Transactions on Information Theory},
  volume = {28},
  number = {2},
  year = {1982},
  pages = {129--136},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{ji06document,
  author = {Xiang Ji and Wei Xu},
  title = {Document clustering with prior knowledge},
  booktitle = {SIGIR '06: Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {405--412},
  location = {Seattle, Washington, USA},
  doi = {http://doi.acm.org/10.1145/1148170.1148241},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% month uses the predefined three-letter macro, so it is written unquoted.
@phdthesis{strehl02relationship,
  author = {Alexander Strehl},
  title = {Relationship-based Clustering and Cluster Ensembles for High-dimensional Data Mining},
  year = {2002},
  month = may,
  school = {The University of Texas at Austin},
}

@inproceedings{yang06near,
  author = {Hui Yang and Jamie Callan},
  title = {Near-duplicate detection by instance-level constrained clustering},
  booktitle = {SIGIR '06: Proceedings of the 29th annual international ACM SIGIR conference on Research and
development in information retrieval},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {421--428},
  location = {Seattle, Washington, USA},
  doi = {http://doi.acm.org/10.1145/1148170.1148243},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% Cluster-based retrieval, clustering interfaces, and comparison of partitions.

@book{salton75dynamic,
  author = {Gerard Salton},
  title = {Dynamic information and library processing},
  publisher = {Prentice-Hall, Inc.},
  address = {Upper Saddle River, NJ, USA},
  year = {1975},
  isbn = {0132213257},
}

@inproceedings{liu04cluster,
  author = {Xiaoyong Liu and W. Bruce Croft},
  title = {Cluster-based retrieval using language models},
  booktitle = {SIGIR '04: Proceedings of the 27th annual international ACM SIGIR conference on Research and development in information retrieval},
  pages = {186--193},
  year = {2004},
  publisher = {ACM Press},
  address = {New York, NY, USA},
  location = {Sheffield, United Kingdom},
  isbn = {1-58113-881-4},
  doi = {http://doi.acm.org/10.1145/1008992.1009026},
}

@article{hearst06clustering,
  author = {Marti A. Hearst},
  title = {Clustering versus faceted categories for information exploration},
  journal = {Commun. ACM},
  volume = {49},
  number = {4},
  pages = {59--61},
  year = {2006},
  publisher = {ACM Press},
  address = {New York, NY, USA},
  issn = {0001-0782},
  doi = {http://doi.acm.org/10.1145/1121949.1121983},
}

@inproceedings{zamir99grouper,
  author = {Oren Zamir and Oren Etzioni},
  title = {Grouper: a dynamic clustering interface to Web search results},
  booktitle = {WWW '99: Proceeding of the eighth international conference on World Wide Web},
  pages = {1361--1374},
  year = {1999},
  publisher = {Elsevier North-Holland, Inc.},
  address = {New York, NY, USA},
  location = {Toronto, Canada},
  doi = {http://dx.doi.org/10.1016/S1389-1286(99)00054-7},
}

@article{hubert85comparing,
  author = {Lawrence Hubert and Phipps Arabie},
  title = {Comparing partitions},
  journal = {Journal of Classification},
  volume = {2},
  pages = {193--218},
  year = {1985},
}

@article{rand71objective,
  author = {William M.
Rand},
  journal = {Journal of the American Statistical Association},
  pages = {846--850},
  title = {Objective criteria for the evaluation of clustering methods},
  volume = {66},
  number = 336,
  year = {1971}
}

% Titles carry no terminal period; the bibliography style supplies punctuation.

@inproceedings{hamerly03kmeans,
  author = {Greg Hamerly and Charles Elkan},
  title = {Learning the k in k-means},
  booktitle = {NIPS},
  year = {2003},
  ee = {http://books.nips.cc/papers/files/nips16/NIPS2003_AA36.pdf},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{vaithyanathan00modelbased,
  author = {Shivakumar Vaithyanathan and Byron Dom},
  title = {Model-Based Hierarchical Clustering},
  booktitle = {UAI '00: Proceedings of the 16th Conference on Uncertainty in Artificial Intelligence},
  year = {2000},
  isbn = {1-55860-709-9},
  pages = {599--608},
  publisher = {Morgan Kaufmann Publishers Inc.},
  address = {San Francisco, CA, USA},
}

@inproceedings{lewis96training,
  author = {David D. Lewis and Robert E. Schapire and James P. Callan and Ron Papka},
  title = {Training algorithms for linear text classifiers},
  booktitle = {SIGIR '96: Proceedings of the 19th annual international ACM SIGIR conference on Research and development in information retrieval},
  year = {1996},
  isbn = {0-89791-792-8},
  pages = {298--306},
  location = {Zurich, Switzerland},
  doi = {http://doi.acm.org/10.1145/243199.243277},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@article{dietterich95multiclass,
  author = {Thomas G. Dietterich and Ghulum Bakiri},
  title = {Solving Multiclass Learning Problems via Error-Correcting Output Codes},
  journal = {J. Artif. Intell. Res.
(JAIR)},
  volume = {2},
  year = {1995},
  pages = {263--286},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}

% The accent is written as a brace group starting with the control sequence,
% which is the form BibTeX treats as a single letter for sorting and labels.
@inproceedings{kaki05findex,
  author = {Mika K{\"a}ki},
  title = {Findex: search result categories help users when document ranking fails},
  booktitle = {CHI '05: Proceedings of the SIGCHI conference on Human factors in computing systems, Portland, Oregon, USA},
  year = {2005},
  isbn = {1-58113-998-5},
  pages = {131--140},
  doi = {http://doi.acm.org/10.1145/1054972.1054991},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@article{lee02high,
  author = {Dong-Ho Lee and Hyoung-Joo Kim},
  title = {An efficient nearest neighbor search in high-dimensional data spaces},
  journal = {Inf. Process. Lett.},
  volume = {81},
  number = {5},
  year = {2002},
  pages = {239--246},
  ee = {http://dx.doi.org/10.1016/S0020-0190(01)00236-8},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{fradkin03experiments,
  author = {Dmitriy Fradkin and David Madigan},
  title = {Experiments with random projections for machine learning},
  booktitle = {KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining},
  year = {2003},
  isbn = {1-58113-737-0},
  pages = {517--522},
  location = {Washington, D.C.},
  doi = {http://doi.acm.org/10.1145/956750.956812},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@article{allwein00reducing,
  author = {Erin L. Allwein and Robert E. Schapire and Yoram Singer},
  title = {Reducing Multiclass to Binary: A Unifying Approach for Margin Classifiers},
  journal = {Journal of Machine Learning Research},
  volume = {1},
  year = {2000},
  pages = {113--141},
  ee = {http://www.ai.mit.edu/projects/jmlr/papers/volume1/allwein00a/abstract.html},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{lewis98naive,
  author = {David D.
Lewis},
  title = {Naive ({Bayes}) at Forty: The Independence Assumption in Information Retrieval},
  booktitle = {ECML '98: Proceedings of the 10th European Conference on Machine Learning},
  year = {1998},
  isbn = {3-540-64417-2},
  pages = {4--15},
  publisher = {Springer-Verlag},
  address = {London, UK},
}

% Proper nouns and acronyms in titles are braced so that sentence-casing
% styles do not lowercase them.

@inproceedings{ng01discriminative,
  author = {Andrew Y. Ng and Michael I. Jordan},
  title = {On Discriminative vs. Generative Classifiers: A comparison of logistic regression and naive {Bayes}},
  booktitle = {NIPS},
  year = {2001},
  pages = {841--848},
  ee = {http://www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/AA28.ps.gz},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}
% crossref = {DBLP:conf/nips/2001},

@book{snedecor89,
  title = {Statistical methods},
  author = {George Waddel Snedecor and William G. Cochran},
  year = 1989,
  publisher = {Iowa State University Press},
}
%address = {Ames},
%note = {8th edition}

% The edition is stored in its own field instead of inside the title.
@book{harold04xml,
  author = {Elliotte Rusty Harold and W. Scott Means},
  title = {XML in a Nutshell},
  edition = {Third},
  publisher = {{O'Reilly Media, Inc.}},
  year = {2004},
  month = oct,
  isbn = {0596007647},
  howpublished = {Paperback},
  keywords = {xml},
}

% INEX 2002 workshop papers. Each one cross-references fuhr03inex.

@inproceedings{mass02juruxml,
  author = {Yosi Mass and Matan Mandelbrod and Einat Amitay and David Carmel and Yo{\"e}lle S. Maarek and Aya Soffer},
  title = {{JuruXML} -- an {XML} Retrieval System at {INEX'02}},
  booktitle = {INEX Workshop},
  year = {2002},
  pages = {73--80},
  crossref = {fuhr03inex},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{kamps02morphological,
  author = {Jaap Kamps and Maarten Marx and Maarten de Rijke and B{\"o}rkur Sigurbj{\"o}rnsson},
  title = {The Importance of Morphological Normalization for {XML} Retrieval},
  booktitle = {INEX Workshop},
  year = {2002},
  pages = {41--48},
  crossref = {fuhr03inex},
  bibsource = {DBLP, http://dblp.uni-trier.de},
}

@inproceedings{yu02xml,
  author = {Cong Yu and Hong Qi and H. V.
Jagadish},
  title = {Integration of {IR} into an {XML} Database},
  booktitle = {INEX Workshop},
  year = {2002},
  pages = {162--169},
  crossref = {fuhr03inex},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}

@inproceedings{goevert03overview,
  author = {Norbert G{\"o}vert and Gabriella Kazai},
  title = {Overview of the {INitiative for the Evaluation of XML} retrieval ({INEX}) 2002},
  pages = {1--17},
  crossref = {fuhr03inex},
  entrydate = 20030226,
  key = {Goevert/Kazai:03},
  note = {In \cite{fuhr03inex}},
}

@inproceedings{carmel03fragments,
  author = {David Carmel and Yo{\"e}lle S. Maarek and Matan Mandelbrod and Yosi Mass and Aya Soffer},
  title = {Searching {XML} documents via {XML} fragments},
  booktitle = {SIGIR '03: Proceedings of the 26th annual international ACM SIGIR conference on Research and development in information retrieval},
  year = {2003},
  isbn = {1-58113-646-3},
  pages = {151--158},
  location = {Toronto, Canada},
  doi = {http://doi.acm.org/10.1145/860435.860464},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% Parent entry for the crossref fields used by the INEX workshop papers.
% Classic BibTeX requires it to come after every entry that cross-references
% it, so keep it below them. booktitle repeats the title because classic
% BibTeX copies fields by name only.
@proceedings{fuhr03inex,
  title = {{INitiative for the Evaluation of XML} Retrieval ({INEX}). Proceedings of the First {INEX} Workshop. Dagstuhl, Germany, {December} 8--11, 2002},
  editor = {Norbert Fuhr and Norbert G{\"o}vert and Gabriella Kazai and Mounia Lalmas},
  series = {ERCIM Workshop Proceedings},
  address = {Sophia Antipolis, France},
  publisher = {ERCIM},
  booktitle = {INitiative for the Evaluation of {XML} Retrieval (INEX). Proceedings of the First INEX Workshop. Dagstuhl, Germany, December 8--11, 2002},
  entrydate = 20030226,
  month = mar,
  year = 2003,
}

@inproceedings{ramirez06small,
  author = {Georgina Ramirez and Thijs Westerveld and Arjen P.
de Vries},
  title = {Using small {XML} elements to support relevance},
  booktitle = {SIGIR '06: Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {693--694},
  location = {Seattle, Washington, USA},
  doi = {http://doi.acm.org/10.1145/1148170.1148321},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% Query evaluation over inverted indexes and compressed text.

@inproceedings{anh06pruned,
  author = {Vo Ngoc Anh and Alistair Moffat},
  title = {Pruned query evaluation using pre-computed impacts},
  booktitle = {SIGIR '06: Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval},
  year = {2006},
  isbn = {1-59593-369-7},
  pages = {372--379},
  location = {Seattle, Washington, USA},
  doi = {http://doi.acm.org/10.1145/1148170.1148235},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% The first author is given in "von Last, First" form.
@article{moura00fast,
  author = {de Moura, Edleno Silva and Gonzalo Navarro and Nivio Ziviani and Ricardo Baeza-Yates},
  title = {Fast and flexible word searching on compressed text},
  journal = {ACM Trans. Inf. Syst.},
  volume = {18},
  number = {2},
  year = {2000},
  issn = {1046-8188},
  pages = {113--139},
  doi = {http://doi.acm.org/10.1145/348751.348754},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@article{persin96filtered,
  author = {Michael Persin and Justin Zobel and Ron Sacks-Davis},
  title = {Filtered document retrieval with frequency-sorted indexes},
  journal = {J. Am. Soc. Inf. Sci.},
  volume = {47},
  number = {10},
  year = {1996},
  issn = {0002-8231},
  pages = {749--764},
  doi = {http://dx.doi.org/10.1002/(SICI)1097-4571(199610)47:10<749::AID-ASI3>3.3.CO;2-U},
  publisher = {John Wiley \& Sons, Inc.},
  address = {New York, NY, USA},
}

@inproceedings{scholer02inverted,
  author = {Falk Scholer and Hugh E.
Williams and John Yiannis and Justin Zobel},
  title = {Compression of inverted indexes for fast query evaluation},
  booktitle = {SIGIR '02: Proceedings of the 25th annual international ACM SIGIR conference on Research and development in information retrieval},
  year = {2002},
  isbn = {1-58113-561-0},
  pages = {222--229},
  location = {Tampere, Finland},
  doi = {http://doi.acm.org/10.1145/564376.564416},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% Inverted-index compression and self-indexing.

@article{anh05invertedindex,
  author = {Vo Ngoc Anh and Alistair Moffat},
  title = {Inverted Index Compression Using Word-Aligned Binary Codes},
  journal = {Inf. Retr.},
  volume = {8},
  number = {1},
  year = {2005},
  issn = {1386-4564},
  pages = {151--166},
  doi = {http://dx.doi.org/10.1023/B:INRT.0000048490.99518.5c},
  publisher = {Kluwer Academic Publishers},
  address = {Hingham, MA, USA},
}

@article{moffat96selfindexing,
  author = {Alistair Moffat and Justin Zobel},
  title = {Self-indexing inverted files for fast text retrieval},
  journal = {ACM Trans. Inf.
Syst.},
  volume = {14},
  number = {4},
  year = {1996},
  issn = {1046-8188},
  pages = {349--379},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@inproceedings{anh01termination,
  author = {Vo Ngoc Anh and Owen de Kretser and Alistair Moffat},
  title = {Vector-space ranking with effective early termination},
  booktitle = {SIGIR '01: Proceedings of the 24th annual international ACM SIGIR conference on Research and development in information retrieval},
  year = {2001},
  isbn = {1-58113-331-6},
  pages = {35--42},
  location = {New Orleans, Louisiana, United States},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@inproceedings{anh02impact,
  author = {Vo Ngoc Anh and Alistair Moffat},
  title = {Impact transformation: Effective and efficient web retrieval},
  booktitle = {SIGIR '02: Proceedings of the 25th annual international ACM SIGIR conference on Research and development in information retrieval},
  year = {2002},
  pages = {3--10},
  location = {Tampere, Finland},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% Web resource: the only locator is the address in howpublished.
@misc{nutch2006,
  author = {Nutch},
  howpublished = {http://wiki.apache.org/nutch},
  year = 2006,
}

% The camel-case system name is braced so sentence-casing styles keep it.
@inproceedings{dean04mapreduce,
  author = {Jeffrey Dean and Sanjay Ghemawat},
  title = {{MapReduce}: Simplified Data Processing on Large Clusters},
  booktitle = {Sixth Symposium on Operating System Design and Implementation},
  address = {San Francisco, CA},
  year = 2004,
}

@article{harman90retrieving,
  author = {Donna Harman and Gerald Candela},
  title = {Retrieving records from a gigabyte of text on a minicomputer using statistical ranking},
  journal = {Journal of the American Society for Information Science},
  volume = 41,
  number = 8,
  year = 1990,
  pages = {581--589},
}

% The sharp s is written as its own brace group inside the surname.
@inproceedings{fuhr01xirql,
  author = {Norbert Fuhr and Kai Gro{\ss}johann},
  title = {{XIRQL}: {A} query language for information retrieval in {XML} documents},
  booktitle = {Proceedings of the 24th annual international ACM SIGIR Conference on Research and Development in Information Retrieval},
  year = {2001},
  isbn =
{1-58113-331-6},
  pages = {172--180},
  location = {New Orleans, Louisiana, United States},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% Document clustering: comparisons, surveys and hierarchical methods.

@inproceedings{steinbach00comparison,
  author = {Michael Steinbach and George Karypis and Vipin Kumar},
  title = {A comparison of document clustering techniques},
  booktitle = {KDD Workshop on Text Mining},
  year = {2000},
}

@inproceedings{glover02structure,
  author = {Eric J. Glover and Kostas Tsioutsiouliklis and Steve Lawrence and David M. Pennock and Gary W. Flake},
  title = {Using web structure for classifying and describing web pages},
  booktitle = {WWW '02: Proceedings of the 11th international conference on World Wide Web},
  year = {2002},
  isbn = {1-58113-449-5},
  pages = {562--569},
  location = {Honolulu, Hawaii, USA},
  doi = {http://doi.acm.org/10.1145/511446.511520},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

@article{jain99data,
  author = {A. K. Jain and M. N. Murty and P. J. Flynn},
  title = {Data clustering: a review},
  journal = {ACM Comput. Surv.},
  volume = {31},
  number = {3},
  year = {1999},
  issn = {0360-0300},
  pages = {264--323},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% The author is given in "Last, Jr., First" form so the suffix is parsed as such.
@article{ward63hierarchical,
  author = {Ward, Jr., J. H.},
  title = {Hierarchical grouping to optimize an objective function},
  journal = {Journal of the American Statistical Association},
  volume = 58,
  pages = {236--244},
  year = 1963,
}

@inproceedings{elhamdouchi86hierarchic,
  author = {A. El-Hamdouchi and P.
Willett},
  title = {Hierarchic document classification using {Ward's} clustering method},
  booktitle = {SIGIR '86: Proceedings of the 9th annual international ACM SIGIR conference on Research and development in information retrieval},
  year = {1986},
  isbn = {0-89791-187-3},
  pages = {149--156},
  location = {Palazzo dei Congressi, Pisa, Italy},
  doi = {http://doi.acm.org/10.1145/253168.253200},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% Hierarchical clustering algorithms. Both Computer Journal entries use the
% same journal name.

@article{murtagh83survey,
  author = {Fionn Murtagh},
  title = {A Survey of Recent Advances in Hierarchical Clustering Algorithms},
  journal = {The Computer Journal},
  volume = {26},
  number = {4},
  year = {1983},
  pages = {354--359},
}

@article{defays77efficient,
  author = {Daniel Defays},
  title = {An efficient algorithm for a complete link method},
  journal = {The Computer Journal},
  volume = 20,
  number = 4,
  pages = {364--366},
  year = 1977,
}

@book{cormen90algorithms,
  author = {Thomas H. Cormen and Charles Eric Leiserson and Ronald L. Rivest},
  title = {Introduction to Algorithms},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  pages = {xvii + 1028},
  year = {1990},
  ISBN = {0-262-03141-8, 0-07-013143-0 (McGraw-Hill)},
  ISBN-13 = {978-0-262-03141-7, 978-0-07-013143-9 (McGraw-Hill)},
  LCCN = {QA76.6 .C662 1990},
}

@article{day84efficient,
  author = {William H. Day and Herbert Edelsbrunner},
  year = 1984,
  title = {Efficient Algorithms for Agglomerative Hierarchical Clustering Methods},
  journal = {Journal of Classification},
  volume = 1,
  pages = {1--24},
}

@article{aichholzer96classifying,
  author = {Oswin Aichholzer and Franz Aurenhammer},
  title = {Classifying hyperplanes in hypercubes},
  journal = {SIAM Journal on Discrete Mathematics},
  year = 1996,
  volume = 9,
  number = 2,
  pages = {225--232},
}
% note = {[IIG-Report-Series 408, TU Graz, Austria, 1995]},
% abstract = {We consider hyperplanes spanned by vertices of the unit
% $d$-cube.
% We classify these hyperplanes by parallelism to
% coordinate axes, by symmetry of the $d$-cube vertices they
% avoid, as well as by so-called hull-honesty. (Hull-honest
% hyperplanes are those whose intersection figure with the
% $d$-cube coincides with the convex hull of the $d$-cube
% vertices they contain; they do not cut $d$-cube edges
% properly.) We describe relationships between these classes,
% and give the exact number of hull-honest hyperplanes, in
% general dimensions. An experimental enumeration of all
% spanned hyperplanes up to dimension eight showed us the
% intrinsic difficulty of developing a general enumeration
% scheme. Motivation for considering such hyperplanes stems
% from coding theory, from linear programming, and from the
% theory of machine learning.}

% Volume 62 is the 1967 volume of this journal: the same journal is cited in
% this file with volume 58 for 1963 and volume 66 for 1971.
@article{king67stepwise,
  author = {B. King},
  title = {Step-wise clustering procedures},
  journal = {Journal of the American Statistical Association},
  volume = {62},
  year = {1967},
  pages = {86--101},
}

@book{sneath73numerical,
  author = {Peter H. A. Sneath and Robert R. Sokal},
  title = {Numerical Taxonomy: The Principles and Practice of Numerical Classification},
  publisher = {W.H.~Freeman},
  address = {San Francisco},
  year = 1973,
  isbn = {0 7167 0697 0},
}

@techreport{voorhees85effectiveness,
  author = {Ellen M. Voorhees},
  title = {The Effectiveness and Efficiency of Agglomerative Hierarchic Clustering in Document Retrieval},
  institution = {Cornell University},
  year = 1985,
  number = {TR 85-705},
}

% Unpublished entries need a note field, hence the literal note below.
@unpublished{popescul00automatic,
  title = {Automatic Labeling of Document Clusters},
  author = {Alexandrin Popescul and Lyle H. Ungar},
  note = {unpublished},
  year = 2000,
}

@inproceedings{mckeown95generating,
  author = {Kathleen McKeown and Dragomir R.
Radev},
  title = {Generating summaries of multiple news articles},
  booktitle = {SIGIR '95: Proceedings of the 18th annual international ACM SIGIR conference on Research and development in information retrieval},
  year = {1995},
  isbn = {0-89791-714-6},
  pages = {74--82},
  location = {Seattle, Washington, United States},
  doi = {http://doi.acm.org/10.1145/215206.215334},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% The second author is given in "Last, Jr., First" form so the suffix is
% parsed as a suffix rather than as part of the surname.
@inproceedings{azcarraga01extracting,
  author = {Arnulfo P. Azcarraga and Yap, Jr., Teddy N.},
  title = {Extracting Meaningful Labels for {WEBSOM} Text Archives},
  booktitle = {{CIKM}},
  pages = {41--48},
  year = {2001},
}
% url = "citeseer.ist.psu.edu/azcarraga01extracting.html" }

@inproceedings{glover02inferring,
  author = {Eric Glover and David M. Pennock and Steve Lawrence and Robert Krovetz},
  title = {Inferring hierarchical descriptions},
  booktitle = {CIKM '02: Proceedings of the eleventh international conference on Information and knowledge management},
  year = {2002},
  isbn = {1-58113-492-4},
  pages = {507--514},
  location = {McLean, Virginia, USA},
  doi = {http://doi.acm.org/10.1145/584792.584876},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}

% The field name is editor (singular); an unknown field name is silently
% ignored by BibTeX, which would drop the editors from the output.
@incollection{darrell07locality,
  author = {A. Andoni and N. Immorlica and P. Indyk and V. Mirrokni},
  title = {Locality-sensitive hashing using stable distributions},
  booktitle = {Nearest Neighbor Methods in Learning and Vision: Theory and Practice},
  editor = {T. Darrell and P. Indyk and G. Shakhnarovich},
  publisher = {MIT Press},
  year = 2007,
}

@inproceedings{hull96document,
  author = {David Hull and Jan Pedersen and Hinrich Sch{\"u}tze},
  title = {Document routing as statistical classification},
  booktitle = {AAAI Spring Symposium on Machine Learning in Information access},
  year = {1996},
}

@article{cover67nearest,
  author = {Thomas M. Cover and Peter E.
Hart},
  title = {Nearest neighbor pattern classification},
  journal = {IEEE Transactions on Information Theory},
  volume = {13},
  number = {1},
  year = {1967},
  pages = {21--27}
}

@inproceedings{yang94expert,
  author = {Yiming Yang},
  title = {Expert network: effective and efficient learning from human decisions in text categorization and retrieval},
  booktitle = {SIGIR},
  pages = {13--22},
  year = 1994,
}

% Case protection braces the whole word rather than a single letter.
@inproceedings{turtle94boolean,
  author = {Howard Turtle},
  year = {1994},
  title = {Natural language vs. {Boolean} query evaluation: a comparison of retrieval performance},
  pages = {212--220},
  booktitle = {SIGIR 17},
}

% The mixed-case title is intentional and fully braced to survive recasing.
@inproceedings{lita03truecasing,
  author = {Lucian Vlad Lita and Abe Ittycheriah and Salim Roukos and Nanda Kambhatla},
  title = {{tRuEcasIng}},
  booktitle = {ACL 41},
  year = 2003,
  pages = {152--159},
}

@inproceedings{boldi05skiplists,
  author = {Paolo Boldi and Sebastiano Vigna},
  title = {Compressed perfect embedded skip lists for quick inverted-index lookups},
  booktitle = {Proceedings of String Processing and Information Retrieval (SPIRE 2005)},
  series = {Lecture Notes in Computer Science},
  publisher = {Springer-Verlag},
  year = 2005,
}

@book{manning99foundations,
  author = {Christopher D. Manning and Hinrich Sch{\"u}tze},
  title = {Foundations of Statistical Natural Language Processing},
  year = 1999,
  address = {Cambridge, MA},
  publisher = {MIT Press},
}

@inproceedings{bahle02phrase,
  author = {Dirk Bahle and Hugh E. Williams and Justin Zobel},
  year = 2002,
  title = {Efficient Phrase Querying with an Auxiliary Index},
  booktitle = {SIGIR 2002},
  pages = {215--221},
}

@article{williams04phrase,
  author = {Hugh E.
Williams and Justin Zobel and Dirk Bahle},
  year = 2004,
  title = {Fast Phrase Querying With Combined Indexes},
  journal = {ACM Transactions on Information Systems},
  volume = {22},
  number = {4},
  pages = {573--594}
}

@inproceedings{sproat03bakeoff,
  author = {Richard Sproat and Thomas Emerson},
  year = 2003,
  title = {The First International {Chinese} Word Segmentation Bakeoff},
  booktitle = {The Second SIGHAN Workshop on Chinese Language Processing},
}
% Sapporo, Japan, July 2003.

% Disabled older entry for the same book, kept for reference:
%book{witten99,
%title = {Managing Gigabytes: Compressing and Indexing Documents and Images},
%author = {Ian H. Witten and Alistair Moffat and Timothy C. Bell},
%year = {1999},
%PUBLISHER = "Morgan Kaufman"
%}

% Classic BibTeX styles expect the edition as a capitalised ordinal word.
@book{witten99gigabytes,
  author = {Ian H. Witten and Alistair Moffat and Timothy C. Bell},
  title = {Managing Gigabytes: Compressing and Indexing Documents and Images},
  publisher = {Morgan Kaufmann},
  address = {San Francisco, CA},
  year = 1999,
  edition = {Second},
}

@inproceedings{mccallum98comparison,
  author = {Andrew McCallum and Kamal Nigam},
  title = {A Comparison of Event Models for {Naive} {Bayes} Text Classification},
  year = 1998,
  booktitle = {Working Notes of the 1998 AAAI/ICML Workshop on Learning for Text Categorization},
  pages = {41--48},
}

@article{friedman97bias,
  author = {Jerome H. Friedman},
  title = {On Bias, Variance, 0/1--Loss, and the Curse-of-Dimensionality},
  journal = {Data Mining and Knowledge Discovery},
  year = {1997},
  volume = {1},
  number = {1},
  pages = {55--77},
  annote = {Also, Technical Report, Stanford University, 1996},
}

@book{duda00pattern,
  author = {Richard O. Duda and Peter E. Hart and David G.
Stork},
  title = {Pattern Classification},
  edition = {Second},
  year = {2000},
  isbn = {0471056693},
  publisher = {Wiley-Interscience},
}

% Publisher spelling matches the other Morgan Kaufmann entries in this file.
@incollection{cheeseman90finding,
  author = {Peter Cheeseman},
  title = {On Finding the Most Probable Model},
  booktitle = {Computational Models of Scientific Discovery and Theory Formation},
  publisher = {Morgan Kaufmann},
  year = 1990,
  editor = {Jeff Shrager and Pat Langley},
  pages = {73--95},
}

@inproceedings{croft78cluster,
  author = {W. Bruce Croft},
  title = {A file organization for cluster-based retrieval},
  booktitle = {SIGIR '78: Proceedings of the 1st annual international ACM SIGIR conference on information storage and retrieval},
  year = {1978},
  pages = {65--82},
  publisher = {ACM Press},
  address = {New York, NY, USA},
}
% doi = {http://doi.acm.org/10.1145/800096.803136},

% The acronym is braced so sentence-casing styles do not lowercase it.
@article{nigam00em,
  author = {Kamal Nigam and Andrew Kachites McCallum and Sebastian Thrun and Tom Mitchell},
  title = {Text Classification from Labeled and Unlabeled Documents using {EM}},
  journal = {Mach. Learn.},
  volume = {39},
  number = {2-3},
  year = {2000},
  issn = {0885-6125},
  pages = {103--134},
  publisher = {Kluwer Academic Publishers},
  address = {Hingham, MA, USA},
}

@inproceedings{zhang03modified,
  author = {Jian Zhang and Rong Jin and Yiming Yang and Alexander G.
Hauptmann},
  title = {Modified Logistic Regression: An Approximation to {SVM} and Its Applications in Large-Scale Text Categorization},
  booktitle = {ICML},
  year = {2003},
  pages = {888--895},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}
% crossref = {DBLP:conf/icml/2003},

% String macros. A name without a suffix holds a proceedings title and the
% same name with an "o" suffix holds the matching place. Entries reference
% them unquoted (for example: booktitle = coling94, address = coling94o).
% A macro must be defined before the first entry that uses it.

@string{csli = {Center for the Study of Language and Information}}
@string{csliort = {Stanford University}}
@string{cl = {Computational Linguistics}}

@string{acl79 = {Proceedings of the 17th Annual Meeting of the ACL, University of California at San Diego}}
@string{acl79o = {La Jolla, Cal.}}
@string{acl80 = {Proceedings of the 18th Annual Meeting of the ACL, University of Pennsylvania}}
@string{acl80o = {Philadelphia, Pa.}}
@string{acl81 = {Proceedings of the 19th Annual Meeting of the ACL, Stanford University}}
@string{acl81o = {Stanford, Cal.}}
@string{acl82 = {Proceedings of the 20th Annual Meeting of the ACL, University of Toronto}}
@string{acl82o = {Toronto, Ont.}}
@string{acl83 = {Proceedings of the 21st Annual Meeting of the ACL, Massachusetts Institute of Technology}}
@string{acl83o = {Cambridge, Mass.}}
@string{acl84 = {Proceedings of the 22nd Annual Meeting of the ACL}}
@string{acl85 = {Proceedings of the 23rd Annual Meeting of the ACL, University of Chicago}}
@string{acl85o = {Chicago, Ill.}}
@string{acl86 = {Proceedings of the 24th Annual Meeting of the ACL, Columbia University}}
@string{acl86o = {New York, N.Y.}}
@string{acl87 = {Proceedings of the 25th Annual Meeting of the ACL, Stanford University}}
@string{acl87o = {Stanford, Cal.}}
@string{acl88 = {Proceedings of the 26th Annual Meeting of the ACL, State University of New York at Buffalo}}
@string{acl88o = {Buffalo, New York}}
@string{acl89 = {Proceedings of the 27th Annual Meeting of the ACL, University of British Columbia}}
@string{acl89o = {Vancouver, B.C., Canada}}
@string{acl90 = {Proceedings of the 28th Annual Meeting of the ACL, University of Pittsburgh}}
@string{acl90o = {Pittsburgh, PA.}}
@string{acl91 = "Proceedings of the
29th Annual Meeting of the ACL, University of California"}
@string{acl91o = {Berkeley, California}}
@string{acl92 = {Proceedings of the 30th Annual Meeting of the ACL, University of Delaware}}
@string{acl92o = {Newark, Delaware}}
@string{acl93 = {Proceedings of the 31st Annual Meeting of the ACL, Ohio State University}}
@string{acl93o = {Columbus, Ohio}}
@string{acl94 = {Proceedings of the 32nd Annual Meeting of the ACL, New Mexico State University}}
@string{acl94o = {Las Cruces, New Mexico}}
@string{acl95 = {Proceedings of the 33rd Annual Meeting of the ACL, Massachusetts Institute of Technology}}
@string{acl95o = {Cambridge, Mass.}}
@string{acl96 = {Proceedings of the 34th Annual Meeting of the ACL, University of California}}
@string{acl96o = {Santa Cruz, Cal.}}
@string{acl97 = {Proceedings of the 35th Annual Meeting of the ACL}}
@string{acl97o = {Madrid, Spain}}
@string{acl98 = {Proceedings of ACL-COLING'98}}
@string{acl98o = {Montreal, Canada}}
@string{acl99 = {Proceedings of the 37th Annual Meeting of the ACL}}
@string{acl99o = {College Park, MD}}
@string{acl00 = {Proceedings of the 38th Annual Meeting of the ACL}}
@string{acl00o = {Hong Kong}}
@string{acl01 = {Proceedings of the 39th Annual Meeting of the ACL}}
@string{acl01o = {Toulouse, France}}
@string{acl02 = {Proceedings of the 40th Annual Meeting of the ACL}}
@string{acl02o = {Philadelphia, PA}}
@string{acl03 = {Proceedings of the 41st Annual Meeting of the ACL}}
@string{acl03o = {Sapporo, Japan}}

% Applied NLP conference macros; a trailing "o" marks the place.
@string{anlp88 = {Proceedings of the Second Conference on Applied Natural Language Processing}}
@string{anlp92 = {Proceedings of the Third Conference on Applied Natural Language Processing}}
@string{anlp92o = {Trento, Italy}}
@string{anlp94 = {Proceedings of the 4th Conference on Applied Natural Language Processing}}
@string{anlp94o = {Stuttgart, Germany}}

% European chapter conference macros; a trailing "o" marks the place.
@string{eacl83 = {Proceedings of the 1st Conference of the European Chapter of the Association for Computational Linguistics}}
@string{eacl83o = "Pisa,
Italy"} @STRING{eacl85 = "Proceedings of the 2nd Conference of the European Chapter of the Association for Computational Linguistics"} @STRING{eacl85o= "Geneva, Switzerland"} @STRING{eacl87 = "Proceedings of the 3rd Conference of the European Chapter of the Association for Computational Linguistics"} @STRING{eacl87o= "Copenhagen, Denmark"} @STRING{eacl89 = "Proceedings of the 4th Conference of the European Chapter of the Association for Computational Linguistics"} @STRING{eacl89o= "Manchester, England"} @STRING{eacl91 = "Proceedings of the 5th Conference of the European Chapter of the Association for Computational Linguistics"} @STRING{eacl91o= "Berlin, Germany"} @STRING{eacl93 = "Proceedings of the 6th Conference of the European Chapter of the Association for Computational Linguistics"} @STRING{eacl93o= "Utrecht, The Netherlands"} @STRING{eacl95 = "Proceedings of the 7th Conference of the European Chapter of the Association for Computational Linguistics"} @STRING{eacl95o= "Dublin"} @STRING{coling84 = "Proceedings of the 10th International Conference on Computational Linguistics"} @STRING{coling84o= "Stanford, Cal."} @STRING{coling86 = "Proceedings of the 11th International Conference on Computational Linguistics"} @STRING{coling86o= "Bonn, West Germany"} @STRING{coling88 = "Proceedings of the 12th International Conference on Computational Linguistics"} @STRING{coling88o= "Budapest, Hungary"} @STRING{coling90 = "Proceedings of the 13th International Conference on Computational Linguistics"} @STRING{coling90o= "Helsinki, Finland"} @STRING{coling92 = "Proceedings of the 14th International Conference on Computational Linguistics"} @STRING{coling92o= "Nantes, France"} @STRING{coling94 = "Proceedings of the 15th International Conference on Computational Linguistics"} @STRING{coling94o= "Kyoto, Japan"} @STRING{coling96 = "Proceedings of the 16th International Conference on Computational Linguistics"} @STRING{coling96o= "Copenhagen, Denmark"} @STRING{coling00 = 
"Proceedings of the 18th International Conference on Computational Linguistics"}
@STRING{coling00o= "Saarbr{\"u}cken, Germany"}
@STRING{coling02 = "Proceedings of the 19th International Conference on Computational Linguistics"}
@STRING{coling02o= "Taipei, Taiwan"}
@STRING{coling04 = "Proceedings of the 20th International Conference on Computational Linguistics"}
@STRING{coling04o= "Geneva, Switzerland"}

@inproceedings{rubinstein97informative,
  author    = "Y. Dan Rubinstein and Trevor Hastie",
  title     = "Discriminative vs Informative Learning",
  year      = "1997",
  booktitle = "SIGKDD"
}
% booktitle = "Sixth ACM SIGKDD Intl. Conf. on Knowledge Discovery and Data Mining",
% pages = "49--53"
% : Discriminative vs Informative Learning. KDD 1997: 49-53

% Entries authored or co-authored by Helmut Schmid follow.
@InProceedings{Schmid:94a,
  author    = "Helmut Schmid",
  title     = "Part-of-Speech Tagging with Neural Networks",
  pages     = "172--176",
  booktitle = coling94,
  year      = 1994,
  address   = coling94o
}

@InProceedings{Schmid:94b,
  author    = "Helmut Schmid",
  title     = "Probabilistic Part-of-Speech Tagging Using Decision Trees",
  pages     = "44--49",
  booktitle = "Proceedings of the International Conference on New Methods in Language Processing",
  year      = 1994,
  address   = "Manchester, UK"
}

@inproceedings{Schmid:95a,
  author    = "Helmut Schmid",
  title     = "Improvements in Part-of-Speech Tagging with an Application to {G}erman",
  booktitle = "Proceedings of the ACL SIGDAT-Workshop",
  pages     = "47--50",
  year      = 1995,
  address   = "Dublin, Ireland"
}

@InCollection{Schmid:99,
  author    = "Helmut Schmid",
  title     = "Improvements in Part-of-Speech Tagging with an Application to German",
  booktitle = "Natural Language Processing Using Very Large Corpora",
  publisher = "Kluwer Academic Publishers",
  year      = 1999,
  editor    = "Susan Armstrong and Kenneth Church and Pierre Isabelle and Sandra Manzi and Evelyne Tzoukermann and David Yarowsky",
  volume    = 11,
  series    = "Text, Speech and Language Technology",
  address   = "Dordrecht",
  pages     = "13--26"
}

@TechReport{Schmid:95b, author = "Helmut Schmid", title
= "Statistische Disambiguierung von Pr{\"a}positionen f{\"u}r den Transfer",
  institution = "Universit{\"a}t Stuttgart",
  year = 1995,
  type = "Verb{\it mobil} Memo",
  number = 87,
  month = "Februar"
}

@InProceedings{Schmid:97,
  author       = "Helmut Schmid",
  title        = "Parsing by Successive Approximation",
  booktitle    = "Proceedings of the 5th International Workshop on Parsing Technologies (IWPT '97)",
  year         = 1997,
  organization = "Massachusetts Institute of Technology",
  address      = "Cambridge, Ma.",
  pages        = "177--186"
}

@InCollection{Schmid:97a,
  author    = "Helmut Schmid",
  title     = "Probabilistic Part-of-Speech Tagging Using Decision Trees",
  booktitle = "New Methods in Language Processing",
  publisher = "UCL Press",
  year      = 1997,
  editor    = "Daniel Jones and Harold Somers",
  series    = "Studies in Computational Linguistics",
  address   = "London, GB",
  pages     = "154--164"
}

@PhdThesis{Schmid:00,
  author  = "Helmut Schmid",
  title   = "YAP: Parsing and Disambiguation With Feature-Based Grammars",
  school  = "Institute for Computational Linguistics, University of Stuttgart",
  year    = 2000,
  address = "Germany"
}

@InCollection{Schmid:00b,
  author    = "Helmut Schmid",
  title     = "Parsing by successive approximation",
  booktitle = "Advances in Probabilistic and Other Parsing Technologies",
  publisher = "Kluwer Academic Publishers",
  year      = 2000,
  editor    = "Harry Bunt and Anton Nijholt",
  volume    = 16,
  series    = "Text, Speech and Language Technology",
  chapter   = 13,
  address   = "Dordrecht",
  pages     = "243--262"
}

@Book{Schmid:00c,
  author    = "Helmut Schmid",
  title     = "{LoPar}: Design and Implementation",
  publisher = "Institute for Computational Linguistics, University of Stuttgart",
  year      = 2000,
  number    = 149,
  series    = "Arbeitspapiere des Sonderforschungsbereiches 340"
}

@TechReport{Schmid:00d,
  author      = "Helmut Schmid",
  title       = "Unsupervised Learning of Period Disambiguation for Tokenisation",
  institution = "IMS, University of Stuttgart",
  year        = 2000,
  postscript= "http://www.ims.uni-stuttgart.de/$\sim$schmid/tokeniser.ps",
  pdf =
"http://www.ims.uni-stuttgart.de/$\sim$schmid/tokeniser.pdf"
}

% COLING papers; booktitle and address expand string macros defined
% elsewhere in this file.
@inproceedings{Schmid:02a,
  author    = "Helmut Schmid",
  title     = "A Generative Probability Model for Unification-Based Grammars",
  booktitle = coling02,
  address   = coling02o,
  pages     = "884--890",
  year      = 2002
}

@inproceedings{Schmid:02b,
  author    = "Helmut Schmid",
  title     = "Lexicalization of Probabilistic Grammars",
  booktitle = coling02,
  address   = coling02o,
  pages     = "891--896",
  year      = 2002
}

@inproceedings{Schmid/Atterer:04,
  author    = "Helmut Schmid and Michaela Atterer",
  title     = "New Statistical Methods for Phrase Break Prediction",
  booktitle = coling04,
  volume    = 1,
  address   = coling04o,
  pages     = "659--665",
  year      = 2004
}

@inproceedings{Schmid:04b,
  author    = "Helmut Schmid",
  title     = "Efficient Parsing of Highly Ambiguous Context-Free Grammars with Bit Vectors",
  booktitle = coling04,
  volume    = 1,
  address   = coling04o,
  pages     = "162--168",
  year      = 2004
}

% The next three entries carry the literal text "to appear" in the year field.
@incollection{Schmid:05a,
  author    = "Helmut Schmid",
  title     = "Tokenizing",
  booktitle = "Corpus Linguistics: An International Handbook",
  editor    = "Anke L{\"u}deling and Merja Kyt{\"o} and Tony McEnery",
  series    = "Handbooks of Linguistics and Communication Science",
  publisher = "Walter de Gruyter",
  address   = "Berlin",
  year      = "to appear"
}

@incollection{Schmid:05b,
  author    = "Helmut Schmid",
  title     = "Part-of-Speech Tagging",
  booktitle = "Corpus Linguistics: An International Handbook",
  editor    = "Anke L{\"u}deling and Merja Kyt{\"o} and Tony McEnery",
  series    = "Handbooks of Linguistics and Communication Science",
  publisher = "Walter de Gruyter",
  address   = "Berlin",
  year      = "to appear"
}

@misc{Schmid:05c,
  author       = "Helmut Schmid",
  title        = "Review of the book: Data-Oriented Parsing by Rens Bod, Remko Scha and Khalil Sima'an",
  howpublished = "Journal of Logic, Language and Information",
  year         = "to appear"
}

@InProceedings{Schmid:05d, author = "Helmut Schmid", title = "A Programming Language For Finite State Transducers", booktitle = "Workshop on Finite State Methods in Natural
Language Processing",
  year = 2005,
  organization = "University of Helsinki",
  address = "Helsinki, Finland",
  note = "submitted"
}

@Manual{YAP-Manual,
  title        = "The YAP Manual",
  author       = "Helmut Schmid",
  organization = "Institute for Computational Linguistics, University of Stuttgart",
  address      = "Germany"
}

% The editor list must be joined with the BibTeX keyword "and"; any other
% word between the names makes BibTeX read the whole string as one person.
@InCollection{Schmid/Kempe:96,
  author    = "Helmut Schmid and Andr{\'e} Kempe",
  title     = "{Tagging} von deutschen {Corpora} mit {HMM}, {Entscheidungsb{\"a}umen} und {Neuronalen} {Netzen}",
  booktitle = "Lexikon und Text: Wiederverwendbare Methoden und Ressourcen zur linguistischen Erschlie{\ss}ung des Deutschen",
  publisher = "Niemeyer",
  year      = 1996,
  editor    = "Helmut Feldweg and Erhard W. Hinrichs",
  number    = 73,
  series    = "Lexicographica Series Maior",
  address   = "T{\"u}bingen",
  pages     = "231--244"
}

@article{Stein/Schmid:95,
  author  = "Achim Stein and Helmut Schmid",
  year    = 1995,
  title   = "Etiquetage morphologique de textes fran\c{c}ais avec un arbre de d\'ecisions",
  journal = "Traitement Automatique des Langues",
  volume  = "Num\'ero sp\'ecial: Traitements probabilistes et corpus",
  pages   = "23--35"
}

@InProceedings{Schmid/etal:04,
  author    = "Helmut Schmid and Arne Fitschen and Ulrich Heid",
  title     = "{SMOR}: A {G}erman Computational Morphology Covering Derivation, Composition and Inflection",
  booktitle = "Proceedings of the 4th International Conference on Language Resources and Evaluation",
  volume    = 4,
  year      = 2004,
  address   = "Lisbon, Portugal",
  pages     = "1263--1266"
}

@InProceedings{Schmid/Schulte:00,
  author    = "Helmut Schmid and Sabine {Schulte im Walde}",
  title     = "Robust {G}erman Noun Chunking With a Probabilistic Context-Free Grammar",
  booktitle = coling00,
  year      = 2000,
  address   = coling00o,
  pages     = "726--732"
}

@InProceedings{Schmid/Rooth:01,
  author    = "Helmut Schmid and Mats Rooth",
  title     = "Parse Forest Computation of Expected Governors",
  booktitle = acl01,
  year      = 2001,
  address   = acl01o,
  pages     = "458--465"
}

@InCollection{SchulteImWaldeEtAl:01, keywords = "Gramotron", author =
"Sabine {Schulte im Walde} and Helmut Schmid and Mats Rooth and Stefan Riezler and Detlef Prescher",
  title = "Statistical {G}rammar {M}odels and {L}exicon {A}cquisition",
  booktitle = "Linguistic Form and its Computation",
  publisher = "CSLI Publications",
  year = 2001,
  editor = "Christian Rohrer and Antje Rossdeutscher and Hans Kamp",
  address = "Stanford, CA",
  pages = "389--440"
}

@TechReport{Schmid/Prescher,
  author      = "Detlef Prescher and Helmut Schmid",
  title       = "Using Pseudo-Disambiguation and Perplexity-Measures for Evaluation of Probabilistic Verb-Argument Models",
  institution = "IMS, University of Stuttgart",
  year        = 2002,
  type        = "internal report",
  address     = "Stuttgart, Germany"
}

% Statistical parsing references.
@inproceedings{klein03accurate,
  author    = {Dan Klein and Christopher D. Manning},
  title     = {Accurate Unlexicalized Parsing},
  booktitle = {Proceedings of the 41st Annual Meeting of the Association for Computational Linguistics},
  year      = 2003,
}

% month uses the predefined three-letter macro so the style can format it.
@article{collins03reranking,
  author    = {Collins, Michael and Koo, Terry },
  citeulike-article-id = {159099},
  doi       = {10.1162/0891201053630273},
  issn      = {0891-2017},
  journal   = {Computational Linguistics},
  month     = mar,
  number    = {1},
  pages     = {25--70},
  publisher = {MIT Press},
  title     = {Discriminative Reranking for Natural Language Parsing},
  url       = {http://www.ingentaconnect.com/content/mitpress/coli/2005/00000031/00000001/art00003},
  volume    = {31},
  year      = {2005}
}

@inproceedings{charniak05maxent,
  author    = {Eugene Charniak and Mark Johnson},
  title     = {Coarse-to-fine n-best parsing and MaxEnt discriminative reranking},
  booktitle = {ACL 43},
  year      = {2005}
}

@inproceedings{collins97three,
  author    = {Michael Collins},
  title     = {Three generative, lexicalised models for statistical parsing},
  booktitle = {Proceedings of the 35th annual meeting on Association for Computational Linguistics},
  year      = {1997},
  pages     = {16--23},
  location  = {Madrid, Spain},
  publisher = {Association for Computational Linguistics},
  address   = {Morristown, NJ, USA},
}
% PP-attachment and text-simplification references.
@InProceedings{Calvo/etal:2005,
  author    = {Hiram Calvo and Alexander Gelbukh and Adam Kilgarriff},
  title     = {Distributional Thesaurus vs.\ WordNet: A Comparison of Backoff Techniques for Unsupervised {PP} Attachment},
  booktitle = {Procs. of the sixth International Conference on Intelligent Text Processing and Computational Linguistics (CICLing)},
  year      = 2005
}

@InProceedings{Lin:1998,
  author    = {Dekang Lin},
  title     = {Dependency-based evaluation of {MINIPAR}},
  booktitle = {Workshop on the Evaluation of Parsing Systems},
  year      = 1998,
  address   = {Granada, Spain}
}

@Article{Littlestone:1988,
  author  = {N. Littlestone},
  title   = {Learning quickly when irrelevant attributes abound: A new linear threshold algorithm.},
  journal = {Machine Learning},
  year    = 1988,
  volume  = 2,
  pages   = {285--318}
}

@Article{Marcus/etal:1993,
  author  = {Mitchell P. Marcus and Beatrice Santorini and Mary Ann Marcinkiewicz},
  title   = {Building a large natural language corpus of {E}nglish: the Penn Treebank},
  journal = {Computational Linguistics},
  year    = 1993,
  volume  = 19,
  pages   = {313--330}
}

@TechReport{Miller/etal:1993,
  author      = {George A.\ Miller and Richard Beckwith and Christiane D.\ Fellbaum and Derek Gross and Katherine Miller},
  title       = {Five Papers on WordNet},
  institution = {Princeton University, Princeton, N.J.},
  year        = 1993
}

@InProceedings{Siddharthan/etal:2004,
  author    = {Advaith Siddharthan and Ani Nenkova and Kathleen McKeown},
  title     = {Syntactic Simplification for Improving Content Selection in Multi-Document Summarization},
  booktitle = {COLING},
  year      = 2004
}

@InProceedings{Siddharthan:2002,
  author    = {Advaith Siddharthan},
  title     = {Resolving Attachment and Clause Boundary Ambiguities for Simplifying Relative Clause Constructs},
  booktitle = {Proceedings of the Student Research Workshop, 40th Meeting of the Association for Computational Linguistics (ACL 2002)},
  year      = 2002
}

@InProceedings{Siddharthan:2002b, author = {Advaith Siddharthan}, title = {Resolving Relative Clause Attachment
Ambiguities using Machine Learning Techniques and WordNet Hierarchies.},
  booktitle = {Proceedings of the 4th Discourse Anaphora and Anaphor Resolution Colloquium (DAARC 2002)},
  year = 2002
}

@inproceedings{Volk:2001,
  author    = {Martin Volk},
  title     = {Exploiting the {WWW} as a corpus to resolve PP attachment ambiguities},
  booktitle = {Proceedings of Corpus Linguistics 2001},
  year      = 2001
}

@inproceedings{Yeh/Vilain:1998,
  author    = {Alexander S. Yeh and Marc B. Vilain},
  title     = {Some properties of preposition and subordinate conjunction attachments},
  booktitle = {Proceedings of the 17th international conference on Computational linguistics},
  pages     = {1436--1442},
  publisher = {Association for Computational Linguistics},
  address   = {Morristown, NJ, USA},
  location  = {Montreal, Quebec, Canada},
  year      = {1998},
}

% Geographic information and schema matching references.
@phdthesis{volz05nutzung,
  author = {Steffen Volz},
  title  = { Modellierung und Nutzung von Relationen zwischen Mehrfachrepraesentationen in Geo-Informationssystemen },
  school = {University of Stuttgart},
  note   = {Submitted.},
  year   = 2005
}

@inproceedings{do02coma,
  author    = "H. Do and E. Rahm",
  title     = "COMA - A System for Flexible Combination of Schema Matching Approaches",
  booktitle = {VLDB},
  year      = "2002"
}

@inproceedings{markowetz05geographic,
  author    = {Alexander Markowetz and Yen-Yu Chen and Torsten Suel and Xiaohui Long and Bernhard Seeger},
  title     = {Design and Implementation of a Geographic Search Engine},
  booktitle = {8th International Workshop on the Web and Databases},
  year      = 2005
}

@inproceedings{kuhn03implementing,
  author    = {Werner Kuhn and Martin Raubal},
  title     = {Implementing Semantic Reference Systems},
  booktitle = { AGILE},
  year      = 2003
}

@inproceedings{frank03spatio, author = {Andrew U.
Frank},
  title = {Ontology for Spatio-temporal Databases.},
  booktitle = {Spatio-Temporal Databases: The CHOROCHRONOS Approach},
  year = {2003},
  pages = {9--77},
  ee = {http://springerlink.metapress.com/openurl.asp?genre=article{\&}issn=0302-9743{\&}volume=2520{\&}spage=9},
  crossref = {DBLP:conf/chorochronos/2003},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}

% Parent volume referenced through crossref by frank03spatio; it is placed
% after that entry, as classic BibTeX requires for crossref parents.
@proceedings{DBLP:conf/chorochronos/2003,
  editor    = {Manolis Koubarakis and Timos K. Sellis and Andrew U. Frank and St{\'e}phane Grumbach and Ralf Hartmut G{\"u}ting and Christian S. Jensen and Nikos A. Lorentzos and Yannis Manolopoulos and Enrico Nardelli and Barbara Pernici and Hans-J{\"o}rg Schek and Michel Scholl and Babis Theodoulidis and Nectaria Tryfona},
  title     = {Spatio-Temporal Databases: The CHOROCHRONOS Approach},
  booktitle = {Spatio-Temporal Databases: The CHOROCHRONOS Approach},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  volume    = {2520},
  year      = {2003},
  isbn      = {3-540-40552-6},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}

% Typed as an article because the entry supplies journal, volume and number,
% which the misc type does not print.
@article{fonseca02using,
  author = "F. Fonseca and M. Egenhofer and P. Agouris and G.
Camara",
  title = "Using ontologies for integrated geographic information systems",
  journal = {Transactions in Geographic Information Systems},
  volume =6,
  number =3,
  year = "2002"
}
% url = "citeseer.ist.psu.edu/fonseca02using.html" }

@techreport{bateman04baseline,
  year        = 2004,
  author      = {John Bateman and Scott Farrar},
  title       = {Spatial Ontology Baseline},
  institution = {University of Bremen}
}

% A thesis type is used so that the school field is printed; the type
% field replaces the default thesis label with "Diplomarbeit".
@mastersthesis{bofinger01analyse,
  title  = {Analyse und Implementierung eines Verfahrens zur Referenzierung geographischer Objekte},
  author = {Jan-Martin Bofinger},
  school = {Universitaet Stuttgart, IfP},
  type   = {Diplomarbeit},
  year   = 2001
}

@inproceedings{baehr04language,
  author    = {Hans-Peter Baehr and Marina Mueller},
  title     = {Graphics and Language as Complementary Formal Representations for Geospatial Descriptions},
  booktitle = {Proceedings of ISPRS},
  year      = {2004},
  pages     = {216--221}
}

% Accented letters in names are written as {\"u} so that BibTeX treats the
% accent command and its letter as a single character.
@book{manning07retrieval,
  author    = {Christopher D. Manning and Prabhakar Raghavan and Hinrich Sch{\"u}tze},
  title     = {Introduction to Information Retrieval},
  publisher = {Cambridge University Press},
  year      = 2007,
  note      = {To appear}
}

@incollection{chang05biology,
  year      = 2005,
  author    = {Jeff T. Chang and Hinrich Sch{\"u}tze},
  title     = {Abbreviations in Biomedical Text},
  editor    = {Ben Stapley and Sophia Ananiadou},
  booktitle = {Text Mining for Biology},
  publisher = {Artech House Books},
  note      = {To appear}
}

@article{chang02creating,
  author  = {Jeff T. Chang and Hinrich Sch{\"u}tze and Russ B. Altman},
  title   = {Creating an Online Dictionary of Abbreviations from MEDLINE},
  journal = {The Journal of the American Medical Informatics Association},
  year    = 2002,
  volume  = 9,
  number  = 6,
  pages   = {612--620}
}

@inproceedings{kaplan04speed, author = { Ron Kaplan and Stefan Riezler and Tracy H. King and John T.
{Maxwell III} and Alex Vasserman and Richard Crouch },
  title = {Speed and Accuracy in Shallow and Deep Stochastic Parsing},
  booktitle = {HLT},
  year = {2004}
}

@inproceedings{schiehlen03shallow,
  author    = {Michael Schiehlen},
  title     = {Combining Deep and Shallow Approaches in Parsing German},
  booktitle = {ACL},
  year      = {2003}
}

@inproceedings{burke04resources,
  author    = {Michael Burke and Aoife Cahill and Ruth O'Donovan and Josef van Genabith and Andy Way},
  title     = {Treebank-Based Acquisition of Wide-Coverage, Probabilistic LFG Resources: Project Overview, Results and Evaluation},
  year      = 2004,
  booktitle = {Beyond shallow analyses - Formalisms and statistical modeling for deep analyses. IJCNLP Workshop}
}

% The surname O'Donovan is written without an inner space, matching
% burke04resources, so BibTeX keeps it together as the last name.
@inproceedings{cahill03multilingual,
  title     = {Treebank-Based Multilingual Unification-Grammar Development},
  author    = {A. Cahill and M. Forst and M. McCarthy and R. O'Donovan and C. Rohrer and J. van Genabith and A. Way},
  booktitle = {Workshop on Ideas and Strategies for Multilingual Grammar Development at the 15th European Summer School in Logic Language and Information},
  year      = 2003
}
%, Vienna, Austria, 18th - 29th August 2003

@article{foth05weighted,
  author  = {Kilian Foth and Wolfgang Menzel and Ingo Schr{\"o}der},
  title   = {Robust parsing with weighted constraints},
  journal = {Natural Language Engineering},
  year    = 2005,
  pages   = {1--25},
  volume  = 11,
  number  = 1
}

@article{bikel04collins,
  author  = {Daniel M. Bikel},
  title   = {Intricacies of Collins' Parsing Model},
  journal = {Computational Linguistics},
  year    = 2004,
  pages   = {479--511},
  volume  = 30,
  number  = 4
}

@article{friedman97biasxxx,
  journal = {Data Mining and Knowledge Discovery},
  title   = {On Bias, Variance, 0/1-Loss, and the Curse-of-Dimensionality},
  author  = {Jerome H. Friedman},
  year    = 1997,
  pages   = {55--77},
  volume  = 1,
  number  = 1
}

@inproceedings{ mccallum98comparisonxxx, author = "A. McCallum and K.
Nigam",
  title = "A comparison of event models for Naive Bayes text classification",
  booktitle = {AAAI-98 Workshop on Learning for Text Categorization},
  text = {A. McCallum and K. Nigam. A comparison of event models for Naive Bayes text classification. In AAAI-98 Workshop on Learning for Text Categorization, 1998.},
  year = "1998",
  url = "citeseer.ist.psu.edu/mccallum98comparison.html"
}

% Text categorization references.
@article{sebastiani02automated,
  author  = {Fabrizio Sebastiani},
  title   = {Machine Learning in Automated Text Categorization},
  journal = {ACM Computing Surveys},
  volume  = 34,
  number  = 1,
  pages   = {1--47},
  year    = 2002
}

@inproceedings{yang97selection,
  author    = {Yiming Yang and Jan Pedersen},
  title     = {Feature selection in statistical learning of text categorization},
  booktitle = {ICML},
  year      = 1997
}
%, pages 412-420, 1997.

@inproceedings{li03loss,
  author    = {Fan Li and Yiming Yang},
  title     = {A Loss Function Analysis for Classification Methods in Text Categorization.},
  booktitle = {ICML},
  year      = {2003},
  pages     = {472--479}
}
# crossref = {DBLP:conf/icml/2003},
# bibsource = {DBLP, http://dblp.uni-trier.de}

@book{voorhees05experiment, editor = {Ellen M.
Voorhees and Donna Harman},
  title = {TREC: Experiment and Evaluation in Information Retrieval},
  publisher = {MIT press},
  year = 2005
}

@article{elias75universal,
  author  = {Peter Elias},
  title   = {Universal Code word sets and representations of the integers},
  journal = {IEEE Transactions on Information Theory},
  volume  = 21,
  number  = 2,
  year    = 1975,
  pages   = {194--203}
}

@inproceedings{carmel02affinities,
  author    = {David Carmel and Eitan Farchi and Yael Petruschka and Aya Soffer},
  title     = {Automatic query refinement using lexical affinities with maximal information gain},
  booktitle = {SIGIR},
  year      = 2002
}
%pages : 283-290

@inproceedings{lin98minipar,
  author    = {Dekang Lin},
  title     = {Dependency-based Evaluation of MINIPAR},
  booktitle = {Workshop on the Evaluation of Parsing Systems},
  year      = 1998
}

% Names in the editor list are separated by "and"; a comma between two
% people would be read by BibTeX as a "Last, First" split.
% month uses the predefined macro for May.
@inproceedings{zens04phrasebased,
  author    = {Zens, Richard and Ney, Hermann},
  title     = {Improvements in Phrase-Based Statistical Machine Translation},
  booktitle = {HLT-NAACL 2004: Main Proceedings },
  editor    = {Susan Dumais and Daniel Marcu and Salim Roukos},
  year      = 2004,
  month     = may,
  address   = {Boston, Massachusetts, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {257--264}
}

@inproceedings{schmid04bit,
  title     = {Efficient Parsing of Highly Ambiguous Context-Free Grammars with Bit Vectors},
  booktitle = {Coling},
  year      = 2004,
  author    = {Helmut Schmid}
}

@phdthesis{boersma98functional,
  title  = {Functional phonology: Formalizing the interactions between articulatory and perceptual drives},
  author = {Paul Boersma},
  school = {University of Amsterdam},
  year   = 1998
}

@article{keller02optimality,
  author  = {Frank Keller and Ash Asudeh},
  title   = {Probabilistic Learning Algorithms and Optimality Theory},
  journal = {Linguistic Inquiry},
  volume  = 33,
  number  = 2,
  pages   = {225--244},
  year    = 2002
}

@book{smolensky05harmonic,
  author = {Paul Smolensky and G{\'e}raldine Legendre},
  title = {The Harmonic Mind: From Neural Computation To Optimality-Theoretic Grammar},
publisher = {MIT Press},
  year = 2005
}

% Active learning and ensemble-method references.
@inproceedings{roy01toward,
  author    = "Nicholas Roy and Andrew McCallum",
  title     = "Toward Optimal Active Learning through Sampling Estimation of Error Reduction",
  booktitle = {ICML},
  year      = "2001",
  url       = "citeseer.ist.psu.edu/roy01toward.html"
}
% booktitle = "Proc. 18th International Conf. on Machine Learning",
% publisher = "Morgan Kaufmann, San Francisco, CA",
% pages = "441--448",

@article{baram04choice,
  author  = {Yoram Baram and Ran El-Yaniv and Kobi Luz},
  title   = {Online Choice of Active Learning Algorithms},
  journal = {J. Mach. Learn. Res.},
  volume  = 5,
  pages   = {255--291},
  year    = 2004
}

@inproceedings{iyengar00adaptive,
  author    = "Vijay S. Iyengar and Chidanand Apte and Tong Zhang",
  title     = "Active Learning using Adaptive Resampling",
  year      = "2000",
  booktitle = "SIGKDD",
  url       = "citeseer.ist.psu.edu/iyengar00active.html"
}
% booktitle = "Sixth ACM SIGKDD Intl. Conf. on Knowledge Discovery and Data Mining",
% pages = "92--98"

@article{freund97committee,
  author  = {Yoav Freund and H. Sebastian Seung and Eli Shamir and Naftali Tishby},
  title   = {Selective Sampling Using the Query by Committee Algorithm},
  journal = {Mach. Learn.},
  volume  = {28},
  number  = {2-3},
  year    = {1997},
  pages   = {133--168}
}
% issn = {0885-6125},
% publisher = {Kluwer Academic Publishers},

@article{breiman96bagging,
  author  = "Leo Breiman",
  title   = "Bagging Predictors",
  journal = "Mach. Learn.",
  volume  = "24",
  number  = "2",
  pages   = "123--140",
  year    = "1996",
  url     = "citeseer.ist.psu.edu/breiman96bagging.html"
}

@article{domingos97optimality,
  author  = "Pedro Domingos and Michael J. Pazzani",
  title   = "On the Optimality of the Simple {B}ayesian Classifier under Zero-One Loss",
  journal = "Mach. Learn.",
  volume  = "29",
  number  = "2-3",
  pages   = "103--130",
  year    = "1997",
  url     = "citeseer.ist.psu.edu/domingos97optimality.html"
}

@inproceedings{riezler02wall, author = {Stefan Riezler and Tracy H. King and Ronald M. Kaplan and Richard S. Crouch and John T.
{Maxwell III} and Mark Johnson},
  title = {Parsing the Wall Street Journal using a Lexical-Functional Grammar and Discriminative Estimation Techniques.},
  booktitle = {ACL},
  year = {2002},
  pages = {271--278},
  ee = {http://www.aclweb.org/anthology/P02-1035.pdf},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}

@book{kuhn03theoretic,
  author    = {Jonas Kuhn},
  title     = {Optimality-Theoretic Syntax -- A Declarative Approach},
  publisher = {CSLI Publications},
  year      = 2003
}

@incollection{bresnan99unmarked,
  author    = {Joan Bresnan},
  year      = {2000},
  title     = {The Emergence of the Unmarked Pronoun},
  note      = {To appear},
  booktitle = {Optimality-theoretic Syntax},
  editor    = {Geraldine Legendre and Sten Vikner and Jane Grimshaw},
  publisher = {The MIT Press}
}

@book{dupre93disorder,
  author    = {John Dupr{\'e}},
  title     = {The Disorder of Things},
  publisher = {Harvard University Press},
  year      = 1993
}

% The apostrophe in the surname is plain text; inside math mode it would be
% typeset as a prime.
@book{bod03dataoriented,
  author    = {Rens Bod and Remko Scha and Khalil Sima'an},
  title     = {Data-Oriented Parsing},
  publisher = {CSLI Publications},
  year      = 2003
}

@techreport{kirchner99preliminary,
  year        = 1999,
  author      = {Robert Kirchner},
  title       = {Preliminary Thoughts on Phonologization within an Exemplar-Based Speech Processing System},
  institution = {UCLA Working Papers in Linguistics},
  address     = {Los Angeles CA}
}

@article{pulvermueller99words,
  author  = {Friedemann Pulverm{\"u}ller},
  year    = 1999,
  title   = {Words in the brain's language},
  journal = {Behavioral and Brain Sciences},
  volume  = 22,
  pages   = {253--336}
}

@unpublished{haspelmath04frequentist,
  author = {Martin Haspelmath},
  title  = {A frequentist explanation of some universals of reflexive marking},
  note   = {Handout},
  year   = 2004
}

@incollection{bybee01liaison,
  author    = {Joan Bybee},
  year      = 2001,
  title     = {Frequency effects on French liaison},
  booktitle = {Frequency effects and Emergent Grammar},
  editor    = {Joan Bybee and Paul Hopper},
  address   = {Amsterdam},
  publisher = {John Benjamins},
  pages     = {337--359}
}

@incollection{manning02syntax, author = {Chris
Manning},
  title = {Probabilistic Syntax},
  editor = {Rens Bod and Jennifer Hay and Stefanie Jannedy},
  booktitle = {Probabilistic Linguistics},
  address = {Cambridge MA},
  publisher = {MIT Press},
  year=2002
}

@incollection{abney96statistical,
  author    = "Steven Abney",
  title     = "Statistical Methods and Linguistics",
  editor    = "Judith Klavans and Philip Resnik",
  booktitle = "The Balancing Act: Combining Symbolic and Statistical Approaches to Language",
  publisher = "The {MIT} Press",
  pages     = "1--26",
  year      = "1996"
}
%, url = "citeseer.ist.psu.edu/abney96statistical.html"
%, Massachusetts",
% address = "Cambridge MA",

% postscript and pdf are not standard BibTeX fields; standard styles ignore them.
@inproceedings{evert04morphosyntactic,
  author     = {Evert, Stefan},
  title      = {The Statistical Analysis of Morphosyntactic Distributions},
  booktitle  = {Proc.\ of LREC},
  address    = {Lisbon, Portugal},
  pages      = {1539--1542},
  year       = 2004,
  postscript = "http://www.ims.uni-stuttgart.de/projekte/corplex/paper/evert/Evert2004b.ps.gz",
  pdf        = "http://www.ims.uni-stuttgart.de/projekte/corplex/paper/evert/Evert2004b.pdf"
}

@inproceedings{carroll98valence,
  author    = {Carroll, Glen and Rooth, Mats},
  title     = {Valence Induction with a Head-lexicalized {PCFG}},
  booktitle = {Proc.\ of EMNLP},
  address   = {Granada, Spain},
  year      = 1998,
  annote    = {reference for English Gramotron}
}

@inproceedings{schmid94decision,
  author    = {Schmid, Helmut},
  title     = {Probabilistic Part-of-Speech Tagging Using Decision Trees},
  booktitle = {Proc.\ of the International Conference on New Methods in Language Processing (NeMLaP)},
  pages     = {44--49},
  year      = 1994
}

@unpublished{schmid05morphological,
  author = {Helmut Schmid},
  title  = {Disambiguation of Morphological Structure using a PCFG},
  note   = {Submitted},
  year   = 2005
}

@inproceedings{voorhees03evaluating,
  author    = "Ellen M Voorhees",
  title     = "Evaluating the Evaluation",
  booktitle = "Human Language Technology",
  year      = "2003"
}
%
% Voorhees, E. M.
(INFORMATION ACCESS DIVISION - 894)
%Evaluating the Evaluation: A Case Study Using the TREC 2002 Question Answering Track
%International Conference on Human Language Technology , May, 2003 , Edmonton, Canada

@article{zhang01text,
  author    = "Tong Zhang and Frank J. Oles",
  title     = "Text Categorization Based on Regularized Linear Classification Methods",
  journal   = "Information Retrieval",
  volume    = "4",
  number    = "1",
  publisher = "Kluwer Academic Publishers",
  pages     = "5--31",
  year      = "2001",
  url       = "citeseer.ist.psu.edu/zhang00text.html"
}

% doi holds the bare identifier, without a resolver prefix.
@article{saartsechansky04sampling,
  author    = {Maytal Saar-Tsechansky and Foster Provost},
  title     = {Active Sampling for Class Probability Estimation and Ranking},
  journal   = {Mach. Learn.},
  volume    = {54},
  number    = {2},
  year      = {2004},
  issn      = {0885-6125},
  pages     = {153--178},
  doi       = {10.1023/B:MACH.0000011806.12374.c3},
  publisher = {Kluwer Academic Publishers},
}

@techreport{smith86designing,
  year        = 1986,
  author      = {Sidney L. Smith and Jane N. Mosier},
  title       = {Guidelines for Designing User Interface Software},
  institution = {MITRE},
  number      = {ESD-TR-86-278}
}
%address = {Bedford MA},
% note = {http://www.deakin.edu.au/$\tilde{\ }$malcolmc/hci}

% The underscore in the note is escaped because note text is typeset by
% LaTeX, where a bare underscore outside math mode is an error.
@misc{smith86designing2,
  author = {malcolm?},
  title  = {Smith and Mosier HCI Guidelines, Section 3.0},
  year   = {2005},
  note   = {http://www.deakin.edu.au/$\tilde{\ }$malcolmc/hci/hci\_sect3.0.html}
}

@article{cohn94active,
  author    = {David Cohn and Les Atlas and Richard Ladner},
  title     = {Improving Generalization with Active Learning},
  journal   = {Mach. Learn.},
  volume    = {15},
  number    = {2},
  year      = {1994},
  issn      = {0885-6125},
  pages     = {201--221},
  doi       = {10.1023/A:1022673506211},
  publisher = {Kluwer Academic Publishers},
}

@inproceedings{ seung92query, author = "H. S.
Seung and Manfred Opper and Haim Sompolinsky",
  title = "Query by Committee",
  booktitle = "Computational Learning Theory",
  pages = "287--294",
  year = "1992",
  url = "citeseer.ist.psu.edu/seung92query.html"
}

% Placeholder entry: every field below is dummy text.
@book{dontknow,
  author    = {a. replacethis},
  title     = {no title},
  publisher = {no publisher},
  address   = {no address},
  year      = 2099
}

@inproceedings{blum98combining,
  author    = "Avrim Blum and Tom Mitchell",
  title     = "Combining Labeled and Unlabeled Data with Co-training",
  year      = "1998",
  booktitle = "COLT",
  url       = "citeseer.ist.psu.edu/blum98combining.html"
}
% booktitle = "{COLT}: Proc.\ of the Workshop on Computational Learning Theory, Morgan Kaufmann Publishers",

@inproceedings{mccallum98employing,
  author    = {Andrew K. McCallum and Kamal Nigam},
  title     = {Employing {EM} in pool-based active learning for text classification},
  booktitle = {ICML},
  year      = {1998},
  url       = {citeseer.ist.psu.edu/mccallum98employing.html}
}
% booktitle = {Proc.\ of {ICML}-98, 15th International Conference on Machine Learning},
% publisher = {Morgan Kaufmann Publishers, San Francisco, US},
% address = {Madison, US},
% editor = {Jude W. Shavlik},
% pages = {350--358},

@inproceedings{muslea02multiview,
  author    = {Ion Muslea and Steven Minton and Craig A. Knoblock},
  title     = {Active + Semi-supervised Learning = Robust Multi-View Learning},
  year      = {2002},
  booktitle = {ICML},
  isbn      = {1-55860-873-7}
}
% booktitle = {ICML '02: Proc.\ of the Nineteenth International Conference on Machine Learning},
% pages = {435--442},
% publisher = {Morgan Kaufmann Publishers Inc.},

@book{vapnik82dependencies,
  author    = {Vladimir N. Vapnik},
  title     = {Estimation of Dependencies Based on Empirical Data},
  publisher = {Springer},
  address   = {Berlin},
  year      = 1982
}

@article{lewis04benchmark, author = {David D. Lewis and Yiming Yang and Tony G. Rose and Fan Li}, title = {{RCV1}: A New Benchmark Collection for Text Categorization Research}, journal = {J. Mach. Learn.
Res.},
  volume = {5},
  year = {2004},
  issn = {1533-7928},
  pages = {361--397},
  publisher = {MIT Press},
}

% doi holds the bare identifier, without a resolver prefix.
@inproceedings{iyengar00resampling,
  author    = {Vijay S. Iyengar and Chidanand Apte and Tong Zhang},
  title     = {Active learning using adaptive resampling},
  booktitle = {KDD '00: Proc.\ of the sixth ACM SIGKDD international conference on Knowledge discovery and data mining},
  year      = {2000},
  isbn      = {1-58113-233-6},
  pages     = {91--98},
  location  = {Boston, Massachusetts, United States},
  doi       = {10.1145/347090.347110},
  publisher = {ACM Press},
}

@unpublished{lewis01uncertainty,
  author = {David D. Lewis},
  title  = {Training text classifiers by uncertainty sampling},
  note   = {Manuscript, AT\&T Labs},
  year   = 2001
}

@article{tong01active,
  author  = {Simon Tong and Daphne Koller},
  year    = 2001,
  title   = {Support Vector Machine Active Learning with Applications to Text Classification},
  journal = {J. Mach. Learn. Res.},
  volume  = 2,
  pages   = {45--66}
}
% Support Vector Machine Active Learning with Applications to Text Classification.
% Simon Tong, Daphne Koller. Journal of Machine Learning Research.
% Volume 2, pages 45-66. 2001.. (gzip PS). Also available as (PS).

@inproceedings{schohn00less,
  author    = {Greg Schohn and David Cohn},
  title     = {Less is More: {A}ctive Learning with Support Vector Machines},
  booktitle = {ICML},
  year      = {2000},
  url       = {citeseer.ist.psu.edu/schohn00less.html}
}
% booktitle = {Proc. 17th International Conf. on Machine Learning},
% publisher = {Morgan Kaufmann, San Francisco, CA},
% pages = {839--846},

@book{woods86statistics,
  author    = {Anthony Woods and Paul Fletcher and Arthur Hughes},
  title     = {Statistics in Language Studies},
  publisher = {Cambridge University Press},
  year      = 1986
}

% Stephan Winter, Silvia Nittel: Formal information
% modelling for standardisation in the spatial
% domain.
International Journal of Geographical
% Information Science 17(8): 721-741 (2003)

@inproceedings{beil99inside,
  title = {Inside-Outside Estimation of a Lexicalized PCFG for German},
  author = {Franz Beil and Glenn Carroll and Detlef Prescher and Stefan Riezler and Mats Rooth},
  booktitle = "Proc.\ of ACL",
  year = 1999
}

@inproceedings{rooth99inducing,
  author = {Mats Rooth and Stefan Riezler and Detlef Prescher and Glenn Carroll and Franz Beil},
  title = {Inducing a Semantically Annotated Lexicon via EM-Based Clustering},
  booktitle = "Proc.\ of ACL",
  year = 1999
}

% Accented letters are written as brace groups ({\"u}) so BibTeX treats each as one letter.
@unpublished{schuetze05active,
  author = {Hinrich Sch{\"u}tze},
  title = {Learnability in Active Learning},
  note = {Submitted},
  year = 2005
}

@book{joachims2002classify,
  author = {Thorsten Joachims},
  title = {Learning to Classify Text using Support Vector Machines},
  publisher = {Kluwer},
  year = 2002
}

@book{hastie2001elements,
  title = {The Elements of Statistical Learning: Data Mining, Inference, and Prediction},
  author = {Trevor Hastie and Robert Tibshirani and Jerome H. Friedman},
  publisher = {Springer Verlag},
  address = {New York},
  year = 2001
}

@inproceedings{lewis94sequential,
  author = "David D. Lewis and William A. Gale",
  title = "A sequential algorithm for training text classifiers",
  booktitle = {SIGIR},
  year = "1994",
  url = "citeseer.ist.psu.edu/lewis94sequential.html"
}
% Long-form venue data for the entry above, kept for reference:
% booktitle = "Proc.\ of {SIGIR}-94, 17th {ACM} International Conference on Research and Development in Information Retrieval",
% publisher = "Springer Verlag, Heidelberg, DE",
% address = "Dublin, IE",
% editor = "W. Bruce Croft and Cornelis J. van Rijsbergen",
% pages = "3--12",

@book{klotz00grammatik,
  title = {Grammatik und Lexik},
  author = {Michael Klotz},
  year = 2000,
  publisher = {Stauffenburg},
  address = {T{\"u}bingen}
}

@InProceedings{hull96trec5,
  author = {Hull, David A. and Grefenstette, Gregory and Schulze, B.
Maximilian and Gaussier, Eric and Sch{\"u}tze, Hinrich and Pedersen, Jan O.},
  title = {Xerox {TREC}-5 Site Report: Routing, Filtering, {NLP}, and {S}panish Tracks},
  booktitle = {NIST Special Publication 500-238: The Fifth Text REtrieval Conference (TREC-5)},
  pages = {167--180},
  year = {1997},
  editor = {Voorhees, E. M. and Harman, D. K.},
  address = "Gaithersburg, MD, USA",
  publisher = {Department of Commerce, National Institute of Standards and Technology},
}
% The author list above uses the "Last, First" form; each name needs its own comma.

% The doi field below holds the bare DOI; styles add the resolver prefix themselves.
@article{pitkow02personalized,
  author = {James Pitkow and Hinrich Sch{\"u}tze and Todd Cass and Rob Cooley and Don Turnbull and Andy Edmonds and Eytan Adar and Thomas Breuel},
  title = {Personalized search},
  journal = {Communications of the ACM},
  volume = {45},
  number = {9},
  year = {2002},
  issn = {0001-0782},
  pages = {50--55},
  doi = {10.1145/567498.567526},
  publisher = {ACM Press},
}

@inproceedings{schwering04case,
  author = {A. Schwering and G. Hart},
  year = 2004,
  title = {A Case Study for Semantic Translation of the Water Framework Directive and a Topographic Database},
  booktitle = {7th Conference on Geographic Information Science (AGILE)},
  pages = {503--510}
}

@inproceedings{klien04architecture,
  author = {E. Klien and U. Einspanier and M. Lutz and S. H{\"u}bner},
  year = 2004,
  title = {An Architecture for Ontology-Based Discovery and Retrieval of Geographic Information},
  booktitle = {7th Conference on Geographic Information Science (AGILE)},
  pages = {179--188}
}

@article{harvey99central,
  author = {F. Harvey and W. Kuhn and H. Pundt and Y. Bishr and C.
Riedemann},
  year = 1999,
  title = {Semantic Interoperability: A Central Issue for Sharing Geographic Information},
  journal = {Annals of Regional Science},
  volume = 33,
  number = 2,
  pages = {213--232}
}

@article{Kuhn2001,
  author = {Kuhn, Werner},
  title = {Ontologies in support of activities in geographical space},
  journal = {International Journal of Geographical Information Science},
  year = {2001},
  volume = {15},
  number = {7},
  pages = {679--687}
}

% Web resource; the note field carries the URL (the tilde is typeset via math mode).
@misc{schuetze04nexusquery,
  author = {Hinrich Sch{\"u}tze},
  title = {Results of the query {\em Lehenviertel}},
  year = {2004},
  note = {http://www.ims.uni-stuttgart.de/$\tilde{\ }$schuetze/publish/20041102nexus2.html}
}

@inproceedings{noy99smart,
  title = {SMART: Automated Support for Ontology Merging and Alignment},
  author = {Natalya Fridman Noy and Mark A. Musen},
  year = 1999,
  booktitle = {Banff Workshop on Knowledge Acquisition, Modeling, and Management}
}

@misc{hovy04sensus,
  author = {Eduard Hovy and Kevin Knight and Mike Junk},
  title = {Large Resources: ONTOLOGIES (sensus) and lexicons},
  year = {2004},
  note = {http://www.isi.edu/natural-language/projects/ONTOLOGIES.html}
}

@phdthesis{evert04statistics,
  title = {The statistics of Word Cooccurrences: Word Pairs and Collocations},
  author = {Stefan Evert},
  school = {University of Stuttgart},
  year = 2004
}

% The non-standard "text" field below is ignored by the standard styles.
@inproceedings{agirre00enriching,
  author = {E. Agirre and O. Ansa and E. Hovy and D. Martinez},
  title = {Enriching Very Large Ontologies Using the WWW},
  text = {E. Agirre, O. Ansa, E. Hovy and D. Martinez, Enriching Very Large Ontologies Using the WWW, in Proc. of the Ontology Learning Workshop, ECAI, Berlin, Germany, 2000.},
  booktitle = {Proc.
of the Ontology Learning Workshop, ECAI},
  year = {2000},
  url = {citeseer.ist.psu.edu/agirre00enriching.html}
}

@inproceedings{riloff99multilevel,
  author = {Ellen Riloff and Rosie Jones},
  year = 1999,
  title = {Learning Dictionaries for Information Extraction by Multi-level Boot-strapping},
  booktitle = {Proc.\ of the Sixteenth National Conference on Artificial Intelligence},
  publisher = {{The AAAI Press/MIT Press}},
  pages = {1044--1049},
  url = {citeseer.ist.psu.edu/riloff99learning.html}
}

@article{cohen03webpages,
  author = {William W. Cohen},
  title = {Learning and Discovering Structure in Web Pages},
  year = 2003,
  journal = {IEEE Data Eng. Bull.},
  volume = 26,
  number = 3,
  pages = {3--10}
}

@inproceedings{jakob05exploring,
  author = {Mih{\'a}ly Jakob and Matthias Grossmann and Nicola H{\"o}nle and Daniela Nicklas},
  title = {DCbot: Exploring the Web as Value-added Service for Location-based Applications},
  booktitle = {International Conference on Data Engineering},
  note = {Demo.},
  year = 2005
}

% Project web page: the address is an ordinary URL, not a DOI, so it is stored in url.
@misc{bateman04ontospace,
  author = {John Bateman},
  title = {Project I1: Ontospace},
  year = {2004},
  note = {http://www.sfbtr8.uni-bremen.de/I1},
  url = {http://www.sfbtr8.uni-bremen.de/I1}
}

@article{chang04gapscore,
  author = {Jeffrey T. Chang and Hinrich Sch{\"u}tze and Russ B. Altman},
  title = {GAPSCORE: finding gene and protein names one word at a time},
  journal = {Bioinformatics},
  volume = 20,
  number = 2,
  pages = {216--225},
  year = 2004
}

@article{raychaudhuri03inclusion,
  author = {Soumya Raychaudhuri and Hinrich Sch{\"u}tze and Russ B. Altman},
  title = {Inclusion of Textual Documentation in the Analysis of Multidimensional Data Sets: Application to Gene Expression Data},
  journal = {Mach. Learn.},
  volume = 52,
  number = {1-2},
  pages = {119--145},
  year = 2003
}

% The non-standard "text" field below is ignored by the standard styles.
@misc{chen99multimodal,
  author = {F. Chen and U. Gargi and L. Niles and H. Sch{\"u}tze},
  title = {Multi-modal browsing of images in web documents},
  text = {F. Chen, U. Gargi, L. Niles, and H.
Sch{\"u}tze, Multi-modal browsing of images in web documents, Proc. SPIE Document Recognition and Retrieval, 1999.},
  year = {1999},
  url = {citeseer.ist.psu.edu/chen99multimodal.html}
}

@article{rossdeutscher94remarks,
  author = {Antje Rossdeutscher and Hans Kamp},
  year = 1994,
  title = {Remarks on Lexical Structure and DRS Construction},
  journal = {Theoretical Linguistics},
  volume = 20,
  number = {2/3},
  pages = {97--164}
}

@inproceedings{kuebler01from,
  author = {Sandra K{\"u}bler and Erhard W. Hinrichs},
  title = {From chunks to function-argument structure: A similarity-based approach},
  booktitle = {ACL},
  pages = {338--345},
  year = "2001",
  url = "citeseer.ist.psu.edu/ubler01from.html"
}

% The field must be named "author"; BibTeX silently ignores unknown field names.
@inproceedings{harabagiu00,
  author = {Sanda Harabagiu and Marius Pasca and Steven Maiorano},
  title = {Experiments with Open-Domain Textual Question Answering},
  booktitle = {COLING-2000},
  year = 2000,
  pages = {292--298}
}
% August 2000, Saarbr\"ucken, Germany

@inproceedings{chaudhri00using,
  author = {Vinay K. Chaudhri and Mark E. Stickel and J{\'e}r{\^o}me Tom{\'e}r{\'e} and Richard J. Waldinger},
  title = {Using Prior Knowledge: Problems and Solutions},
  booktitle = {Proc.\ of the Seventeenth National Conference on Artificial Intelligence and Twelfth Conference on Innovative Applications of Artificial Intelligence},
  year = {2000},
  isbn = {0-262-51112-6},
  pages = {436--442},
  publisher = {AAAI Press / The MIT Press},
}

@book{korfhage97,
  title = {Information Storage and Retrieval},
  author = {Robert R. Korfhage},
  year = 1997,
  publisher = {Wiley}
}

@book{levelt89,
  title = {Speaking: From Intention to Articulation},
  author = {Willem J. M.
Levelt},
  year = 1989,
  publisher = {MIT Press}
}

% Chapter in an edited volume.
@incollection{pierrehumbert01,
  author = {Janet Pierrehumbert},
  year = 2001,
  title = {Exemplar dynamics: Word frequency, lenition, and contrast},
  booktitle = {Frequency effects and Emergent Grammar},
  editor = {Joan Bybee and Paul Hopper},
  address = {Amsterdam},
  publisher = {John Benjamins},
  pages = {137--157}
}

@article{kay99,
  author = {Paul Kay and Charles J. Fillmore},
  year = 1999,
  title = {Grammatical constructions and linguistic generalizations: The What's X doing Y? construction},
  journal = {Language},
  volume = 75,
  number = 1,
  pages = {1--33}
}

@inproceedings{kingsbury02adding,
  author = "P. Kingsbury and M. Palmer and M. Marcus",
  title = "Adding semantic annotation to the Penn TreeBank",
  booktitle = {Proc.\ of the Human Language Technology Conference (HLT'02)},
  year = "2002",
  url = "citeseer.ist.psu.edu/kingsbury02adding.html"
}
% text = "Kingsbury, P., Palmer, M., Marcus, M.: Adding semantic annotation to the
% Penn TreeBank. In: Proc.\ of the Human Language Technology Conference
% (HLT'02). (2002)",

@article{maedche01,
  author = {Alexander Maedche and Steffen Staab},
  title = {Ontology Learning for the Semantic Web},
  journal = {IEEE Intelligent Systems},
  volume = 16,
  number = 2,
  pages = {72--79},
  year = 2001
}

@article{everett02,
  author = {John O. Everett and Daniel G. Bobrow and Reinhard Stolle and Richard Crouch and Valeria de Paiva and Cleo Condoravdi and Martin van den Berg and Livia Polanyi},
  title = {Making ontologies work for resolving redundancies across documents},
  journal = {Commun.
ACM},
  volume = {45},
  number = {2},
  year = {2002},
  issn = {0001-0782},
  pages = {55--60},
  doi = {10.1145/503124.503149},
  publisher = {ACM Press},
}

@inproceedings{liu04,
  author = {Hugo Liu and Push Singh},
  title = {Commonsense Reasoning in and over Natural Language},
  publisher = "Springer",
  booktitle = {International Conference on Knowledge-Based Intelligent Information \& Engineering Systems (KES'2004)},
  year = 2004
}
%Proceedings of the 8th
%. Wellington, New Zealand. September
%22-24. Lecture Notes in Artificial Intelligence,

@inproceedings{dagan04,
  author = {Ido Dagan and Oren Glickman},
  title = {Probabilistic textual entailment: {G}eneric applied modeling of language variability},
  booktitle = {Learning Methods for Text Understanding and Mining Workshop},
  year = 2004
}

@book{grossman98,
  title = {Information Retrieval: Algorithms and Heuristics},
  author = {David A. Grossman and Ophir Frieder},
  publisher = {Kluwer},
  year = 1998
}

@book{baezayates99,
  title = {Modern Information Retrieval},
  author = {Ricardo Baeza-Yates and Berthier Ribeiro-Neto},
  publisher = {Addison-Wesley},
  address = {Harlow},
  year = 1999
}

@book{chakrabarti02,
  author = "Soumen Chakrabarti",
  title = "Mining the Web: Analysis of Hypertext and Semi Structured Data",
  publisher = "Morgan Kaufmann",
  year = "2002"
}

@book{belew01,
  author = "Richard K. Belew",
  title = "Finding out about",
  publisher = "Cambridge University Press",
  year = "2001"
}

% Title restored from a garbled export in which the dash had become a literal markup token.
@inproceedings{lin01dirt,
  author = "Dekang Lin and Patrick Pantel",
  title = "{DIRT} -- Discovery of Inference Rules from Text",
  booktitle = "Knowledge Discovery and Data Mining",
  pages = "323--328",
  year = "2001",
  url = "citeseer.ist.psu.edu/lin01dirt.html"
}

@inproceedings{pasca01high,
  author = "Marius Pasca and Sanda M.
Harabagiu",
  title = "High Performance Question/Answering",
  booktitle = "SIGIR",
  pages = "366--374",
  year = "2001",
  url = "citeseer.ist.psu.edu/pasca01high.html"
}

@inproceedings{schweitzer2004,
  author = {Antje Schweitzer and Bernd M{\"o}bius},
  title = {Exemplar-based production of prosody: Evidence from segment and syllable durations},
  booktitle = {Proc.\ of the Speech Prosody Conference},
  pages = {459--462},
  year = 2004
}

@inproceedings{saric2004,
  author = {Jasmin Saric and Lars Juhl Jensen and Rossitza Ouzounova and Isabel Rojas and Peer Bork},
  title = {Extraction of Regulatory Gene Expression Networks from PubMed},
  booktitle = "Proc.\ of ACL",
  year = 2004
}

% Here the address field holds the conference venue rather than a publisher city.
@inproceedings{Erk:2003:TRL,
  author = {Erk, Katrin and Kowalski, Andrea and Pado, Sebastian and Pinkal, Manfred},
  title = {Towards a Resource for Lexical Semantics: A Large German Corpus with Extensive Semantic Annotation},
  booktitle = {Proc.\ of ACL-03},
  address = {Sapporo, Japan},
  url = {http://www.coli.uni-sb.de/~erk/OnlinePapers/ACL03.ps},
  year = {2003}
}

@inproceedings{fillmorebaker2001,
  author = {Charles J. Fillmore and Collin F. Baker},
  title = {Frame Semantics for Text Understanding},
  booktitle = {Proc.\ of WordNet and Other Lexical Resources Workshop, NAACL},
  year = 2001
}

@article{767378,
  author = {Lina Zhou and Dongsong Zhang},
  title = {NLPIR: a theoretical framework for applying natural language processing to information retrieval},
  journal = {J. Am. Soc. Inf. Sci.
Technol.},
  volume = {54},
  number = {2},
  year = {2003},
  issn = {1532-2882},
  pages = {115--123},
  doi = {10.1002/asi.10193},
  publisher = {John Wiley \& Sons, Inc.},
}

@inproceedings{Pang+Lee:04a,
  author = {Bo Pang and Lillian Lee},
  title = {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
  booktitle = "Proc.\ of ACL",
  year = 2004
}

@inproceedings{grefEURALEX94,
  author = "Gregory Grefenstette",
  title = "Corpus-Derived First, Second and Third-Order Word Affinities",
  address = "Amsterdam",
  year = "1994",
  booktitle = "Sixth Euralex International Congress"
}

% Two-word surname: the "Last, First" form keeps "Sparck Jones" together as the last name.
@book{ksj64,
  author = {Sp{\"a}rck Jones, Karen},
  title = "Synonymy and Semantic Classification",
  publisher = "Edinburgh University Press",
  address = "Edinburgh",
  note = " PhD thesis delivered by University of Cambridge in 1964",
  year = "1986",
  annote = ""
}

% "Bau, III, David" is the "Last, Jr, First" form for a name with a suffix.
@book{trefethen97,
  author = "Lloyd N. Trefethen and Bau, III, David",
  title = "Numerical Linear Algebra",
  publisher = "SIAM",
  address = "Philadelphia, PA",
  year = "1997"
}

@incollection{grefenACL96book,
  author = "Gregory Grefenstette",
  title = "Evaluation techniques for Automatic Semantic Extraction: Comparing Syntactic and Window-Based Approaches",
  publisher = "MIT Press",
  editor = "Branimir Boguraev and James Pustejovsky",
  address = "Cambridge, MA",
  booktitle = "Corpus Processing for Lexical Acquisition",
  year = 1996,
  pages = {205--216}
}
% CHAPTER = 11,

@book{grefenbook94,
  author = "Gregory Grefenstette",
  title = "Explorations in Automatic Thesaurus Discovery",
  publisher = "Kluwer Academic Press",
  address = "Boston",
  year = 1994,
  annote = "PhD Thesis"
}

@book{procter78,
  editor = {P. Procter},
  title = {Longman dictionary of contemporary English},
  year = 1978,
  publisher = {Longman Group},
  address = {Harlow, England}
}

@book{frakes92,
  editor={William B.
Frakes and Ricardo Baeza-Yates},
  title = {Information Retrieval: Data Structures and Algorithms},
  year = 1992,
  publisher = {Prentice Hall},
  address = {Englewood Cliffs, NJ}
}

@book{butcher32,
  editor = {S. H. Butcher},
  title = {Aristotle. Aristotle's Theory of Poetry and Fine Arts, with The Poetics},
  year = 1932,
  address = {London},
  note = {4th edition},
  publisher = {Macmillan}
}

@book{winer71,
  author = {B. J. Winer},
  title = {Statistical Principles in Experimental Design},
  year = 1971,
  address = {New York NY},
  note = {2nd edition},
  publisher = {McGraw-Hill}
}

@book{todorov78,
  author = {Tsvetan Todorov},
  title = {Les genres du discours},
  year = 1978,
  address = {Paris},
  publisher = {Seuil}
}

@book{staiger59,
  author = {Emil Staiger},
  title = {Grundbegriffe der Poetik},
  year = 1959,
  address = {Zurich},
  publisher = {Atlantis Verlag}
}

@book{hernadi72,
  author = {Paul Hernadi},
  title = {Beyond Genre},
  year = 1972,
  address = {Ithaca and London},
  publisher = {Cornell University Press}
}

@book{dubrow82,
  author = {Heather Dubrow},
  title = {Genre},
  year = 1982,
  address = {London and New York},
  publisher = {Methuen}
}

@book{fowler82,
  author = {Alistair Fowler},
  title = {Kinds of Literature},
  year = 1982,
  address = {Cambridge MA},
  publisher = {Harvard University Press}
}

@book{frye57,
  author = {Northrop Frye},
  title = {The Anatomy of Criticism},
  year = 1957,
  address = {Princeton},
  publisher = {Princeton University Press}
}

% Corporate author: the extra braces stop BibTeX from splitting it into first and last name.
@book{splus91,
  author = {{Statistical Sciences}},
  year = 1991,
  title = {{S-PLUS} Reference Manual},
  publisher = {Statistical Sciences Inc.},
  address = {Seattle}
}

% A chapter of the authors' own book: inbook takes chapter/pages and needs no booktitle.
@inbook{McN,
  author = "Peter McCullagh and John A. Nelder",
  title = "Generalized Linear Models",
  edition = "2nd",
  chapter = "4",
  pages = "101--123",
  publisher = "Chapman and Hall",
  year = 1989
}

@incollection{dattola71,
  author={R. T.
Dattola},
  title = {Experiments with a fast algorithm for automatic classification},
  editor = {Gerard Salton},
  booktitle = {The {SMART} Retrieval System -- Experiments in Automatic Document Processing},
  publisher = {Prentice-Hall},
  address = {Englewood Cliffs, NJ},
  year = 1971,
  pages = {265--297}
}

% The three Salton (1971) entries share one spelling of the volume title, publisher and address.
@incollection{salton71cluster,
  year = 1971,
  author = {Gerard Salton},
  title = {Cluster search strategies and the optimization of retrieval effectiveness},
  editor = {Gerard Salton},
  booktitle = {The {SMART} Retrieval System},
  publisher = {Prentice-Hall},
  address = {Englewood Cliffs, NJ},
  pages = {223--242}
}

@book{salton71smart,
  editor = {Gerard Salton},
  title = {The {SMART} Retrieval System -- Experiments in Automatic Document Processing},
  publisher = {Prentice-Hall},
  address = {Englewood Cliffs, NJ},
  year = 1971
}

% Proper nouns in the title are protected as whole words so sentence-casing styles keep them.
@incollection{hobbes08,
  author = {Thomas Hobbes},
  title = {The Answer of Mr {Hobbes} to {Sir} {William} {Davenant's} Preface before {Gondibert}},
  editor = {J. E. Spingarn},
  booktitle = {Critical Essays of the Seventeenth Century},
  publisher = {The Clarendon Press},
  address = {Oxford},
  year = 1908
}

@article{hull96,
  author = {David Hull},
  title = {Stemming algorithms -- {A} case study for detailed evaluation},
  journal = {Journal of the American Society for Information Science},
  volume = 47,
  number = 1,
  pages = {70--84},
  year = 1996
}

% Two-word surname: the "Last, First" form keeps "Sparck Jones" together as the last name.
@article{ksj91,
  author = {Sp{\"a}rck Jones, Karen},
  title = {Notes and references on early classification work},
  journal = {ACM SIGIR Forum},
  volume = 25,
  number = 1,
  pages = {10--17},
  year = 1991
}

@article{wongyao91,
  author = {S. K. M. Wong and Y. Y.
Yao},
  title = {An Information-Theoretic Measure of Term Specificity},
  journal = {Journal of the American Society for Information Science},
  volume = 43,
  number = 1,
  pages = {54--61},
  year = 1992
}

@article{youmans91,
  author = {Gilbert Youmans},
  title = {A new tool for discourse analysis: {T}he vocabulary-management profile},
  journal = {Language},
  volume = 67,
  number = 4,
  pages = {763--789},
  year = 1991
}

@article{morris91,
  author = {Jane Morris and Graeme Hirst},
  title = {Lexical cohesion computed by thesaural relations as an indicator of the structure of text},
  journal = {Computational Linguistics},
  volume = 17,
  number = 1,
  pages = {21--48},
  year = 1991
}

% Two Kruskal papers from the same Psychometrika volume, told apart by the a/b key suffix.
@article{kruskal64a,
  author = {J. B. Kruskal},
  title = {Multidimensional scaling by optimizing goodness of fit to a nonmetric hypothesis},
  journal = {Psychometrika},
  volume = 29,
  pages = {1--27},
  year = 1964
}

@article{kruskal64b,
  author = {J. B. Kruskal},
  title = {Nonmetric Multidimensional scaling: {A} numerical method},
  journal = {Psychometrika},
  volume = 29,
  pages = {115--129},
  year = 1964
}

@article{biber86,
  author = {Douglas Biber},
  title = {Spoken and written textual dimensions in {E}nglish: {R}esolving the contradictory findings},
  journal = {Language},
  volume = 62,
  number = 2,
  pages = {384--413},
  year = 1986
}

@article{katz96,
  author = {Slava M. Katz},
  title = {Distribution of content words and phrases in text and language modelling},
  journal = {Natural Language Engineering},
  volume = 2,
  pages = {15--59},
  year = 1996
}

% Journal name spelled as in the other "Computers and the Humanities" entries of this file.
@article{biber92,
  author = {Douglas Biber},
  title = {The multidimensional approach to linguistic analyses of genre variation: {A}n overview of methodology and findings},
  journal = {Computers and the Humanities},
  volume = 26,
  number = {5--6},
  pages = {331--347},
  year = 1992
}

@article{bookstein75,
  title = {A Decision Theoretic Foundation for Indexing},
  author = {Abraham Bookstein and Don R.
Swanson},
  journal = {Journal of the American Society for Information Science},
  pages = {45--50},
  volume = 26,
  number = 1,
  year = 1975
}

@article{harter75,
  title = {A Probabilistic Approach to Automatic Keyword Indexing: Part {II}. An Algorithm for Probabilistic Indexing},
  author = {Steve Harter},
  journal = {Journal of the American Society for Information Science},
  pages = {280--289},
  volume = 26,
  number = 4,
  year = 1975
}

@inproceedings{karlgren94,
  author = {Jussi Karlgren and Douglass Cutting},
  year = 1994,
  title = {Recognizing Text Genres with Simple Metrics Using Discriminant Analysis},
  booktitle = {Proc.\ of Coling 94},
  address = {Kyoto}
}

@inproceedings{pedersen97,
  author = {Ted Pedersen and Rebecca Bruce},
  year = 1997,
  title = {Distinguishing Word Senses in Untagged Text},
  booktitle = {EMNLP 2},
  pages = {197--207}
}
% Long-form venue data for the entry above, kept for reference:
% booktitle = {Proc.\ of the Second Conference on Empirical Methods
% in Natural Language Processing},
% publisher = {Association for Computational Linguistics},
% address = {Somerset NJ}

@inproceedings{ratnaparkhi96,
  author = {Adwait Ratnaparkhi},
  year = 1996,
  booktitle = {EMNLP 1},
  title = {A Maximum Entropy Model for Part-Of-Speech Tagging},
  pages = {133--142}
}
% Long-form venue data for the entry above, kept for reference:
% booktitle = {Proc.\ of the Conference on Empirical Methods
% in Natural Language Processing},
% publisher = {Association for Computational Linguistics},
% address = {Somerset NJ},

% Initials are separated by spaces so BibTeX sees each one as its own name token.
@book{fk82,
  author={W. N. Francis and F.
Ku{\v{c}}era},
  year = 1982,
  title = {Frequency Analysis of English Usage},
  publisher = {Houghton Mifflin},
  address = {Boston}
}

@book{biber95,
  author = {Douglas Biber},
  year = 1995,
  title = {Dimensions of Register Variation: A Cross-Linguistic Comparison},
  publisher = {Cambridge University Press},
  address = {Cambridge UK}
}

@book{biber88,
  author = {Douglas Biber},
  year = 1988,
  title = {Variation across Speech and Writing},
  publisher = {Cambridge University Press},
  address = {Cambridge UK}
}

@book{nunberg90,
  author = {Geoffrey Nunberg},
  year = 1990,
  title = {The Linguistics of Punctuation},
  publisher = {CSLI Publications},
  address = {Stanford, CA}
}

% NOTE(review): the next entry has no leading at-sign, so BibTeX skips it; it looks
% deliberately disabled and is left inactive -- confirm before re-enabling.
article{churchgale95b,
  author = "Kenneth Church and William Gale",
  title = "{P}oisson Mixtures",
  year = "1995",
  journal = "Natural Language Engineering",
  volume = "1",
  number = 2,
  pages = "163--190"
}

@article{gale92,
  author = {William A. Gale and Kenneth W. Church and David Yarowsky},
  title = {A Method for Disambiguating Word Senses in a Large Corpus},
  journal = {Computers and the Humanities},
  volume = 26,
  pages = {415--439},
  year = 1992
}

@article{black88,
  author = {Ezra Black},
  title = {An Experiment in Computational Discrimination of {E}nglish Word Senses},
  journal = {IBM Journal of Research and Development},
  volume = 32,
  pages = {185--194},
  year = 1988
}

@article{choueka85,
  author = {Yaacov Choueka and Serge Lusignan},
  title = {Disambiguation by Short Contexts},
  journal = {Computers and the Humanities},
  volume = 19,
  pages = {147--158},
  year = 1985
}

@article{fgg91,
  author = "Cynthia Fisher and Henry Gleitman and Lila R. Gleitman",
  title = {On the Semantic Content of Subcategorization Frames},
  journal = {Cognitive Psychology},
  volume = 23,
  year = 1991,
  pages = {331--392}
}

%% keys = "Acq:Dict Acq:Stat KB:Lex"
@article{wilks90,
  author = "Yorick A. Wilks and Dan C. Fass and Cheng-ming Guo and James E. McDonald and Tony Plate and Brian M.
Slator",
  title = "Providing Machine Tractable Dictionary Tools",
  journal = "Journal of Computers and Translation",
  volume = 2,
  year = 1990
}

%% keys = "Acq:Dict Acq:Stat KB:Lex"
@article{lin94,
  author = "Charles X. Ling",
  title = "Learning the Past Tense of English Verbs: The Symbolic Pattern Associator vs.\ Connectionist Models",
  journal = "Journal of Artificial Intelligence Research",
  volume = 1,
  pages = {209--229},
  year = 1994
}

% NOTE(review): journal name given in full as "International Journal of Lexicography" --
% confirm against the printed issue (volume 3, number 4).
@article{mbfgm90,
  author = "George A. Miller and Richard Beckwith and Christiane Fellbaum and Derek Gross and Katherine J. Miller",
  title = "Introduction to {WordNet}: An On-line Lexical Database",
  journal = "International Journal of Lexicography",
  volume = 3,
  number = 4,
  pages = "235--244",
  year = 1990
}

@book{covington94,
  author = {Michael A. Covington},
  year = 1994,
  title = {Natural language processing for {Prolog} programmers},
  publisher = {Prentice Hall},
  address = {Englewood Cliffs NJ}
}

@book{gazdar89,
  author = {Gerald Gazdar and Chris Mellish},
  year = 1989,
  title = {Natural language processing in {LISP}},
  publisher = {Addison-Wesley},
  address = {Wokingham, England}
}

@book{pereira87,
  author = {Fernando C. N. Pereira and Stuart M. Shieber},
  year = 1987,
  title = {Prolog and natural-language analysis},
  publisher = {CSLI},
  address = {Stanford CA}
}

@book{winograd83,
  author = {Terry Winograd},
  year = 1983,
  title = {Language as a cognitive process},
  publisher = {Addison Wesley},
  address = {Reading MA}
}

@book{hinrichallen95,
  author = {James Allen},
  year = 1995,
  title = {Natural Language Understanding},
  publisher = {Benjamin/Cummings},
  address = {Redwood City, CA}
}

@book{lyons69,
  author = {John Lyons},
  year = 1969,
  title = {Introduction to theoretical linguistics},
  publisher = {Cambridge University Press}
}

@book{breiman84,
  author = {L. Breiman and J. H. Friedman and R. A. Olshen and C. J.
Stone},
  year = 1984,
  title = {Classification and Regression Trees},
  publisher = {Wadsworth International Group},
  address = {Belmont, CA}
}

% Dictionaries and reference works follow; edited works use editor in place of author.
@book{newcollegiate73,
  editor = {Henry Bosley Woolf},
  year = 1973,
  title = {Webster's new collegiate dictionary},
  publisher = {{G. \& C.} {Merriam Co.}},
  address = {Springfield, MA}
}

@book{mor94,
  author = {Herbert Charles Morton},
  year = 1994,
  title = {The story of {W}ebster's {t}hird: {P}hilip {G}ove's controversial dictionary},
  publisher = {Cambridge University Press},
  address = {Cambridge, England}
}

@book{zipf49human,
  author = {George Kingsley Zipf},
  year = 1949,
  title = {Human Behavior and the Principle of Least Effort},
  publisher = {Addison-Wesley Press},
  address = {Cambridge MA}
}

@book{trask93,
  author = {Robert Lawrence Trask},
  year = 1993,
  title = {A dictionary of grammatical terms in linguistics},
  publisher = {Routledge},
  address = {London}
}

@book{dh73,
  author = {Richard O. Duda and Peter E. Hart},
  year = 1973,
  title = {Pattern classification and scene analysis},
  publisher = {Wiley},
  address = {New York}
}

@book{bbi93,
  author = {Morton Benson and Evelyn Benson and Robert Ilson},
  year = 1993,
  title = {The BBI combinatory dictionary of English},
  publisher = {John Benjamins},
  address = {Amsterdam}
}
% publisher = {John Benjamins Publishing Company},
% address = {Amsterdam/Philadelphia}

@book{oed74,
  author = {A. S. Hornby},
  year = 1974,
  title = {Oxford Advanced Learner's Dictionary of Current English},
  publisher = {Oxford University Press},
  address = {Oxford},
  note = {Third Edition}
}

@book{ogr87,
  author = {William O'Grady},
  title = {Principles of Grammar and Learning},
  publisher = {The University of Chicago Press},
  year = 1987,
  address = {Chicago and London}
}

@book{bb89,
  author = {Bran Boguraev and Ted Briscoe},
  title = {Computational Lexicography for Natural Language Processing},
  year = 1989,
  publisher = {Longman},
  address = {London}
}
% address = {London; New York}

@book{gre74,
  author = {Georgia M.
Green},
  title = {Semantics and Syntactic Regularity},
  year = 1974,
  publisher = {Indiana University Press},
  address = {Bloomington; London}
}

@book{lev93,
  author = {Beth Levin},
  title = {English Verb Classes and Alternations},
  year = 1993,
  publisher = {The University of Chicago Press},
  address = {Chicago}
}

@book{hor75,
  author = {A. S. Hornby},
  title = {Guide to Patterns and Usage in English},
  year = 1975,
  publisher = {Oxford University Press},
  address = {London}
}

@book{black49,
  author = {Max Black},
  title = {Language and philosophy},
  year = 1949,
  publisher = {Cornell University Press},
  address = {Ithaca NY}
}

% Chapters in edited volumes: each names the volume in booktitle and its editor(s) in editor.
@incollection{walker87,
  year = 1987,
  author = {Donald E. Walker},
  title = {Knowledge resource tools for accessing large text files},
  editor = {Sergei Nirenburg},
  booktitle = {Machine Translation: Theoretical and methodological issues},
  publisher = {Cambridge University Press},
  address = {Cambridge},
  pages = {247--261}
}

@incollection{atk93,
  year = 1993,
  author = {B. T. Sue Atkins},
  title = {The contribution of lexicography},
  editor = {Madeleine Bates and Ralph M. Weischedel},
  booktitle = {Challenges in natural language processing},
  publisher = {Cambridge University Press},
  address = {Cambridge},
  pages = {37--75}
}

@incollection{zernik90,
  year = 1990,
  author = {Uri Zernik},
  title = {Tagging Word Senses in Corpus: The Needle in the Haystack Revisited},
  editor = {Paul Schafran Jacobs},
  booktitle = {Text Based Intelligent Systems: Current Research in Text Analysis, Information Extraction, and Retrieval},
  publisher = {GE Research and Development Center},
  pages = {25--29}
}

@incollection{clark87,
  year = 1987,
  author = {Eve V.
Clark},
  title = {The principle of contrast: {A} constraint on language acquisition},
  editor = {Brian MacWhinney},
  booktitle = {Mechanisms of language acquisition},
  publisher = {Lawrence Erlbaum},
  address = {Hillsdale NJ},
  pages = {1--33}
}

@incollection{sch85,
  year = 1985,
  author = {Paul Schachter},
  booktitle = {Language typology and syntactic description: Clause structure},
  volume = 1,
  editor = {Timothy Shopen},
  title = {Parts-of-speech systems},
  pages = {3--61},
  publisher = {Cambridge University Press},
  address = {Cambridge}
}

% The next two chapters come from the same Baker and McCarthy volume.
@incollection{was81,
  year = 1981,
  author = {Thomas Wasow},
  title = {Comments on the paper by {B}aker.},
  editor = {C. L. Baker and John J. McCarthy},
  booktitle = {The logical problem of language acquisition},
  publisher = {MIT Press},
  address = {Cambridge MA},
  pages = {325--329}
}

@incollection{bak81,
  year = 1981,
  author = {C. L. Baker},
  title = {Learnability and the English Auxiliary System},
  editor = {C. L. Baker and John J. McCarthy},
  booktitle = {The logical problem of language acquisition},
  publisher = {MIT Press},
  address = {Cambridge MA},
  pages = {296--324}
}

% The note field of this entry itself cites another key (gee93).
@article{cru82,
  author = {D. Alan Cruse},
  year = 1982,
  title = {On lexical ambiguity},
  journal = {Nottingham Linguistic Circular},
  note = {Cited in \cite{gee93}},
  volume = {11},
  number = 2,
  pages = {65--80}
}

@article{ste91,
  author = {Mark Steedman},
  year = 1991,
  title = {Structure and Intonation},
  journal = {Language},
  volume = 67,
  pages = {260--296}
}

@article{mn81,
  author = {J. Morgan and E.
Newport}, year = 1981, journal = {Journal of Verbal Learning and Verbal Behaviour}, title = {The Role of Constituent Structure in the Induction of an Artificial Language}, volume = 20, pages = {67--85} } @article{mw84, author = {Irene Mazurkewich and Lydia White}, year = 1984, title = {The acquisition of the dative alternation: {U}nlearning overgeneralizations}, journal = {Cognition}, volume = 16, pages = {261--283} } @article{pinker79, author = {Steven Pinker}, year = 1979, title = {Formal models of language learning}, journal = {Cognition}, volume = 1, pages = {217--283} } @article{bak79, author = {C. L. B