package org.sf.mustru.test;
import com.aliasi.util.Files;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;
import junit.framework.TestCase;
import org.apache.log4j.PropertyConfigurator;
import org.sf.mustru.docs.IndexableDoc;
import org.sf.mustru.filters.StarHandler;
import org.sf.mustru.utils.Constants;
import org.sf.mustru.utils.LingpipeTools;
import org.sf.mustru.utils.StringTools;
/**
 * Exercises sentence extraction with {@link LingpipeTools}: text is handed to
 * {@code buildSentences} and the resulting sentences are printed to standard
 * output together with their index and length. The tests make no assertions;
 * the output is meant to be inspected manually.
 */
public class TestSentenceExtraction extends TestCase
{
    /** Configures log4j from {@code Constants.LOG4J_FILE} before each test. */
    public void setUp()
    {
        PropertyConfigurator.configure(Constants.LOG4J_FILE);
    }

    /**
     * Extracts the text of a PDF through {@link StarHandler}, truncates it to
     * {@code Constants.DOC_LENGTH_MAXLIMIT} characters and prints its sentences.
     *
     * <p>The method name does not start with {@code test}, so the JUnit 3 runner
     * does not pick it up automatically (presumably deliberate, since it depends
     * on a file at a machine-specific absolute path).
     *
     * @throws RuntimeException if the filter properties file cannot be read
     */
    public void ttestPdfFile()
    {
        System.out.println("Started testPdfFile");

        Properties props = new Properties();
        StarHandler sh;
        try
        {
            FileInputStream in = new FileInputStream(Constants.FILTER_FILE);
            try
            {
                props.load(in);
            }
            finally
            {
                // Properties.load leaves the stream open; release the file handle here.
                in.close();
            }
            sh = new StarHandler(props);
        }
        catch (IOException e)
        {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new RuntimeException("Could not read filtersFile + " + e.getMessage(), e);
        }

        String filename = "/home/manuk/html/akr/ebooks/MYSQLmanual-a4.pdf";
        IndexableDoc doc = new IndexableDoc();
        sh.getDocument(filename, doc);

        // Cap the amount of text passed to the sentence extractor.
        int contentSizeLimit = Constants.DOC_LENGTH_MAXLIMIT;
        int textSize = doc.getContents().length();
        if (textSize > contentSizeLimit)
        {
            doc.setContents(new StringBuffer(doc.getContents().substring(0, contentSizeLimit)));
        }

        LingpipeTools sentTools = new LingpipeTools();
        sentTools.buildSentences(doc.getContents().toString());
        printSentences(sentTools);

        System.out.println("Ended testPdfFile");
    }

    /**
     * Splits a fixed sample passage into sentences, with the extractor limited
     * to between 150 and 250 characters per sentence, and prints the result.
     */
    public void testTxtFile()
    {
        File file = new File(Constants.TESTINGDIR + File.separator + "samples" + File.separator + "sentences1.txt");
        String text = "";
        // NOTE(review): any value returned by filterChars is discarded, and text is
        // replaced by the literal below, so the file contents never reach the
        // extractor. Confirm whether the sample file was meant to be the input.
        try
        {
            text = Files.readFromFile(file);
            StringTools.filterChars(text);
        }
        catch (IOException ie)
        {
            System.out.println("Could not read the text" + ie.getMessage() );
        }

        // Alternative inputs kept for manual experiments:
        //text = "some junk.............................. and more junk,,,,,,,,,,,,,,,,,,,,,,, dnandn ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,@@@@@@@@@@@@@@@ fffffffffffffff. This is the end of the sentence.";
        //text = "This is a group command. Here is another group command. ";
        text = "SUKRUPA, is a sensitive response to the socio-economic disparities and the banes of caste, class and gender those have been consistently hampering the growth of our nation. Compelled by compassion for the cause of the suffering masses, especially the women and children of the socially and economically disadvantaged communities, a mother and daughter duo founded SUKRUPA in the year 2000. The legal identity of SUKRUPA was no more than the culmination of a long cherished dream and desire of the two visionaries Mrs. Suguna, a tutor by profession with extensive experience in women and child development, together with her daughter Ms. Krupa an entrepreneur, who had made a mark in the global market for over a decade in the granite industry. Identifying themselves with the poorest of the poor together with a few like-minded people, they have ventured into the noble mission of nation building." +
            " Accredited with provisions of exemption from Income Tax Act under Sections 12 A and 80 G, SUKRUPA has its operational base strategically located at Bhuvaneshwari Slum. With Mrs. Suguna being the back bone, Ms. Krupa manages the organization in the capacity of its Chief Functionary. " +
            " VISION: SUKRUPA envisions a new social order founded on social justice and human welfare, where all people could cohesively grow together to become economically viable and socially dignified " +
            " MISSION: SUKRUPA, in order to realize its vision, endeavours to ensure the less endowed enjoy equal opportunities through optimisation of their potentials with optimum utilization of available resources and thereby enable them to take charge of their own development through organized and planned processes for equitable distribution of power, status and resources; dispelling all forms of disparities. " +
            " SUKRUPA's CONCERNS: SUKRUPA and the area of its concern in the State of Karnataka in Southern India encompasses every conceivable element of development and its priorities are rural and urban poor, dalits, women and children. As for the geographic area, presently it is concerned about a cluster of slums in Bangalore city and a few villages in the rural suburb. " +
            " The state of affairs with regard to children is a compelling factor, they being 40 % of our population and with every third household having a working child and every fourth child in the age group of 5 to 15 being employed. The women deemed as lesser beings are the most exploited in all walks of life and need to be empowered to enjoy their rightful status in the family and in the community.";

        System.out.println("INPUT TEXT: ");
        System.out.println(text);

        LingpipeTools sentTools = new LingpipeTools();
        sentTools.setMaxCharsPerSentence(250);
        sentTools.setMinCharsPerSentence(150);
        sentTools.buildSentences(text);
        printSentences(sentTools);
    }

    /**
     * Prints every sentence remaining in {@code sentTools}, one per line, as
     * {@code " <index>( <length> ) : <sentence>"}. Iteration stops when
     * {@code nextSentence()} returns {@code null}.
     *
     * @param sentTools extractor on which {@code buildSentences} was already called
     */
    private void printSentences(LingpipeTools sentTools)
    {
        String sentence;
        int i = 0;
        while ( (sentence = sentTools.nextSentence()) != null)
        {
            System.out.println(" " + i + "( " + sentence.length() + " ) : " + sentence);
            i++;
        }
    }
}