9. Datos no estructurados

 /*

                         * JUIRDBC.java

                         *

                         *

                         *

                         * To change this template, choose Tools | Template Manager

                         * and open the template in the editor.

                         */

                        

                        package juirdbc;

                        

                        

                        import java.io.BufferedReader;

                        import java.io.File;

                        import java.io.FileInputStream;

                        import java.io.FileReader;

                        import java.io.IOException;

                        import java.io.InputStreamReader;

                        import java.io.Reader;

                        import java.util.Vector;

                        import org.apache.lucene.analysis.standard.StandardAnalyzer;

                        import org.apache.lucene.document.DateTools;

                        import org.apache.lucene.document.Field;

                        import org.apache.lucene.index.IndexReader;

                        import org.apache.lucene.index.IndexWriter;

                        import org.apache.lucene.index.Term;

                        import org.apache.lucene.index.TermDocs;

                        import org.apache.lucene.index.TermEnum;

                        import org.apache.lucene.queryParser.QueryParser;

                        import org.apache.lucene.analysis.Analyzer;

                        import org.apache.lucene.search.Hits;

                        import org.apache.lucene.search.IndexSearcher;

                        import org.apache.lucene.search.Query;

                        import org.apache.lucene.search.Searcher;

                        

                        

                        /**

                         *  Class to serve between applications and index files to use retrieval models

                         *  with or without Hermes easier

                         *

                         *@author     carlos

                         *@created    April 16, 2003

                         */

                        public class JUIRDBC {

                        

                            private String indexfile;

                            private boolean store;

                        

                        

                        

                            /**

                             *  Constructor for the JUIRDBC object

                             *

                             *@param  indexfile  Index file to store/retrieve terms information

                             *@param  store      true store full documents too, false just store terms

                             */

                            public JUIRDBC(String indexfile, boolean store) {

                                this.indexfile = indexfile;

                                this.store = store;

                        

                            }

                        

                        

                            

                            

                        

                        

                            /**

                             *  Gets a lucene Document of the specified id, A document contain the some

                             *  section, identifies by a name This means that even we chose not to store

                             *  the document we will have a result

                             *

                             *@param  id               Unique identifier of the resource

                             *@return                  org.apache.lucene.document.Document value, null

                             *      if resource doesnt exist

                             *@exception  IOException  Exception

                             */

                            public org.apache.lucene.document.Document getResource(String id) throws IOException {

                                IndexReader ireader = IndexReader.open(indexfile);

                                TermDocs tdocs = ireader.termDocs(new Term("id", id));

                        

                                if (tdocs.next()) {

                                    return ireader.document(tdocs.doc());

                                }

                        

                                return null;

                            }

                        

                        

                            /**

                             *  Gets a list of all the resources(ids) in current index

                             *

                             *@return                  vector containing ids

                             *@exception  IOException  Exception

                             */

                            public Vector getListResources() throws IOException {

                                Vector vector = new Vector();

                                IndexReader r = IndexReader.open(indexfile);

                        

                                int num = r.numDocs();

                                for (int i = 0; i < num; i++) {

                                    if (!r.isDeleted(i)) {

                                        org.apache.lucene.document.Document d = r.document(i);

                                        vector.add(d.get("id"));

                        

                                    }

                                }

                                TermEnum te = r.terms();

                                int w = 0;

                                while (te.next()) {

                                    w++;

                                }

                                System.out.println(w);

                                r.close();

                                return vector;

                            }

                        

                        

                            /**

                             *  Establish connection with a index service Currently just initialize the

                             *  file because there is no such service

                             *

                             *@exception  IOException  Description of Exception

                             */

                            public void openConnection() throws IOException {

                                if (IndexReader.indexExists(indexfile) == false) {

                                    IndexWriter writer = new IndexWriter(indexfile, new StandardAnalyzer(), true);

                                    writer.close();

                                }

                            }

                        

                        

                            /**

                             *  Termines communication with index service

                             *

                             *@exception  IOException  Exception

                             */

                            public void closeConnection() throws IOException { }

                        

                        

                        

                            /**

                             *  Adds a resource (or terms of) to the index file

                             *

                             *@param  id               Unique identifier of the resource

                             *@param  file             Resource/document to be added

                             *@exception  IOException  Exception

                             */

                            public void addResource(String id, File file) throws IOException {

                        

                                IndexWriter writer = new IndexWriter(indexfile, new StandardAnalyzer(), false);

                                writer.addDocument(fileToDocument(id, file));

                                writer.optimize();

                                writer.close();

                        

                            }

                        

                        

                            /**

                             *  Removes a resource (and/or its terms) from the index file

                             *

                             *@param  id               id of document to be deleted

                             *@exception  IOException  Exception

                             */

                            public void deleteResource(String id) throws IOException {

                                IndexReader ireader = IndexReader.open(indexfile);

                                TermDocs tdocs = ireader.termDocs(new Term("id", id));

                        

                                if (tdocs.next()) {

                                    ireader.deleteDocument(tdocs.doc());

                                }

                        

                                ireader.close();

                        

                            }

                        

                        

                            /**

                             *  Search a query with lucene

                             *

                             *@param  line           string of terms to be searched

                             *@return                lucene hits

                             *@exception  Exception  Exception

                             */

                            public Hits search(String line) throws Exception {

                                Searcher searcher = new IndexSearcher(indexfile);

                                Analyzer analyzer = new StandardAnalyzer();

                        

                                QueryParser parser=new QueryParser("contents",analyzer);

                                Query query = parser.parse(line);

                                System.out.println("Searching for: " + query.toString("contents"));

                        

                                return searcher.search(query);

                            }

                        

                        

                        

                            /**

                             *  Converts a file to Lucene's specification of Document The document

                             *  contains two fields: id - the document id contents - file content

                             *

                             *@param  id                                 identifier of current document

                             *@param  f                                  file to be added

                             *@return                                    Document Value

                             *@exception  java.io.FileNotFoundException  Exception

                             */

                            private org.apache.lucene.document.Document fileToDocument(String id, File f)

                                     throws java.io.FileNotFoundException {

                        

                                // make a new, empty document

                                org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

                        

                                doc.add(new Field("id", id,Field.Store.YES,Field.Index.TOKENIZED));

                                // Add the path of the file as a field named "path".  Use a Text field, so

                                // that the index stores the path, and so that the path is searchable

                                doc.add(new Field("path", f.getPath(),Field.Store.YES,Field.Index.TOKENIZED));

                        

                                // Add the last modified date of the file a field named "modified".  Use a

                                // Keyword field, so that it's searchable, but so that no attempt is made

                                // to tokenize the field into words.

                                doc.add(new Field("modified",

                                                  DateTools.timeToString(f.lastModified(),DateTools.Resolution.YEAR),

                                                  Field.Store.YES,

                                                  Field.Index.TOKENIZED));

                        

                                // Add the contents of the file a field named "contents".  Use a Text

                                // field, specifying a Reader, so that the text of the file is tokenized.

                                // ?? why doesn't FileReader work here ??

                                FileInputStream is = new FileInputStream(f);

                                Reader reader = new BufferedReader(new InputStreamReader(is));

                                try {

                                    char[] documento = new char[new Long(f.length()).intValue()];

                                    FileReader freader = new FileReader(f);

                                    freader.read(documento, 0, new Long(f.length()).intValue());

                                    if (store) {

                                        doc.add(new Field("contents", 

                                                          new String(documento), 

                                                          Field.Store.YES,

                                                          Field.Index.TOKENIZED));

                                    } else {

                                        doc.add(new Field("contents", reader));

                                    }

                        

                                } catch (Exception e) {

                                }

                                // return the document

                                return doc;

                            }

                        

                        }
9. Datos no estructurados

9.1 Introducción

Existen 2 tipos de tareas principales en la recuperación de información:

9.2 Modelos de Recuperación

9.2.1 Clasificación

9.2.2 Modelo Booleano

De este modelo se pueden destacar los siguientes puntos:

¿Por qué es malo?

¿Por qué es popular?

9.2.3 Modelo de Espacios Vectoriales

9.2.3.1 Introducción

9.2.3.2 Algoritmo

Conceptos Iniciales:

Teoría de Vectores

9.2.3.3 Herramientas

Core Classes