In [1]:
from glob import glob
import os.path
import re

from otdet.detector import OOTDetector
from otdet.feature_extraction import CountVectorizerWrapper

In [2]:
# Collect the paths of all .txt files in the thread's dataset directory.
files = glob('datasets/macforums/245842__New-to-macforums-introduce-yourself/*.txt')

In [3]:
def post_num(filename):
    """Return the post number encoded in a post file's name.

    Only the final path component is inspected, so digits that happen to
    appear in a directory name cannot be mistaken for the post number.

    Args:
        filename: Path to a post file whose base name contains a
            ``-<digits>.`` sequence (e.g. ``post-12.txt``, ``post-4.oot.txt``).

    Returns:
        The digits of the first ``-<digits>.`` match in the base name, as an int.

    Raises:
        ValueError: If the base name contains no ``-<digits>.`` sequence.
    """
    # Raw string: '\d' and '\.' are regex escapes, not Python string escapes.
    m = re.search(r'-(\d+)\.', os.path.basename(filename))
    if m is None:
        raise ValueError('no post number found in filename: %r' % (filename,))
    return int(m.group(1))

In [4]:
# Order the paths numerically by post number rather than lexicographically.
files = sorted(files, key=post_num)

In [5]:
# Show the sorted paths to check the numeric ordering (post-9 precedes post-10).
files


Out[5]:
['datasets/macforums/245842__New-to-macforums-introduce-yourself/post-0.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-1.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-2.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-3.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-4.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-5.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-6.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-7.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-8.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-9.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-10.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-11.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-12.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-13.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-14.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-15.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-16.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-17.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-18.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-19.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-20.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-21.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-22.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-23.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-24.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-25.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-26.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-27.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-28.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-29.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-30.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-31.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-32.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-33.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-34.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-35.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-36.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-37.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-38.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-39.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-40.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-41.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-42.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-43.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-44.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-45.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-46.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-47.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-48.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-49.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-50.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-51.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-52.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-53.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-54.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-55.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-56.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-57.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-58.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-59.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-60.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-61.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-62.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-63.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-64.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-65.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-66.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-67.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-68.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-69.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-70.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-71.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-72.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-73.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-74.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-75.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-76.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-77.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-78.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-79.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-80.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-81.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-82.oot.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-83.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-84.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-85.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-86.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-87.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-88.txt',
 'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-89.txt']

In [6]:
# Read every post into memory, keeping the same order as `files`.
# The encoding is pinned so decoding does not depend on the platform's locale
# default; the posts contain non-ASCII characters.
# NOTE(review): assumes the dataset files are UTF-8 encoded — confirm.
documents = []
for file_ in files:
    with open(file_, encoding='utf-8') as f:
        documents.append(f.read())

In [7]:
# Display the raw text of every post that was read, in the order of `files`.
documents


Out[7]:
['Take a second and say hi so we can welcome you to the Mac-Forums community!\n',
 'Hello. \nMy Name is Scott Sullivan. I have already made a post or two on here and look forward to hearing from and helping some of you.\n\nA small bit about myself, 19 years old. Love Gaming and Xbox and I have an iPhone4.\nI use PC (for now) but will be getting a Mac in the very near future.\n\nIf i find advice to be useful, or just polite, then I will use the "Rep" system.\n\nLook forward to hearing from all of you in the future.\n',
 "Hey guys,\n\nMy name is Sushil, I m 21 from India. Yesterday I got my MacBookPro 13.3.\nThis is my first MAC experience as I was using PC till now. Moving from PC to Mac is hard as they say. I don't know all the cool stuff about Mac. It would be very nice if some one can tell me about starting out with mac or anything else.\n",
 'Welcome aboard guys. Lots of helpful folks here.\n',
 "Hi everyone, I currently use a PC (alway have) but I am thinking of defecting to the Mac world (hopefully as son as I can). Don't have any experiance with a Mac, would anyone recommend purchasing a reasonably priced machine from amazon or ebay to get started? Or should I avoid these types of site?\n",
 'Welcome to all the new members. \n',
 'Hello kc7479.\n\nI suggest buying a brand new Mac purely for the "kick" you would get out of personalizing it yourself. \nHowever if money is major factor in your final decision then i\'m sure there is quite a few good machines on the likes of eBay and Amazon. \n\nThe above is just my opinion it would be what I personally would do.\n',
 "When looking for that Mac don't forget to look at the refurbished models at the Apple online store. There are sometimes some pretty good deals there.\n",
 "And since this is the introduction thread, let's move the off-topic discussion to it's own thread to help keep it making sense. \n",
 "Hi my name is Nik, Look forward to saying Hello to New Friend's.\n",
 'Hi, my name is Dan. About 5 weeks ago, I purchased my first Mac. It is the 21.5” iMac Desktop with 2.7 GHz intel Core i5. Also, I am now using Lion on my iMac. It came with Snow Leopard. I have been using PCs since 1981 when a relative gave me a used PC Jr. So far I am loving my iMac. I am looking forward to gaining more insight into my iMac and possibly helping others when I get more proficient. \n\nSunrock\n',
 "Good morning! I'm new here!\n\nI'm on my third Mac since 1992 ... had a Quadra 605, then a couple of PowerPCs, now a Mac Mini which I've had since 2007. Yes, that's four, but the second PowerPC only lasted about a week and it was almost exactly like the other one so I don't count it separately. \n\nI'm running 10.4.11 (Intel) and I haven't installed any anti-virus programs. I'm trying to decide whether to upgrade; and if so, whether to go to SL or to jump to Lion. I guess I should find out if I meet the minimum system requirements for either ... I'm also more than ready for a new smartphone and am debating between the Droid and the iPhone. \n\nHave a great day!!\n",
 'Welcome aboard! Hope you enjoy your stay.\n',
 'Hi everyone,\n\nI\'m Nils. I\'m 16 years old and I live in the Netherlands.\nI am already very familiar with Mac OS since friends of mine have several Macs and iOS-devices which I use on regular basis.\n\nThe first Mac I\'ve ever had was a PowerMac G4 ("Yikes!") which I got for free about a year ago. It wasn\'t meant to be my main computer because its specs aren\'t really modern - it came with Mac OS 9.2.2 and it couldn\'t run Tiger very smooth.\n\nAbout a month ago, I\'ve bought myself a second-hand iMac G5 of the third generation (iSight) and according to Wikipedia, it was the most expensive one you could buy back then. Its specifications are nice, and despite its age I can use it for everything I want to do on my computer - which is a lot compared to what other 16-year-olds do with their computers.\n\nI came to this forum because I want to be even more familiar with Macs and I\'d like to help others with getting familiar with them too.\n',
 "I am new to a Mac snow leopard. I got myself geared up with my old PC and am finding the transition to Mac very hard. There is so much new to learn and I can't seem to do the stupidist things. I inserted a memory stick and I can't find it on my Mac! How stupid is that. Never mind, perhaps if I share my knowledge we can laugh it off together.http://www.mac-forums.com/forums/mac...ilies/Wink.png\n",
 "I have a 6 month old WD external hard drive that has previously worked on my Mac Book Pro. However today it is unrecognizable in on my desk top and in the disc utility. \n\nAny info/insight would be greatly appreciated. The lights are blinking on the external and it sounds like it's working fine.\n\nThanks.\n",
 "I feel like an idiot about asking a question about where to ask a question, but bear with me I'm new to mac and this forum. \n\nI'm guessing I should ask it in the apps and games section, but I don't want to be trolled all over for asking in the wrong zone. \n\nand hi im new to mac my name is Aub. :]\n",
 "Although I think you've posted this question in the wrong section, I'm still going to answer it. \n\nIf the desktop is a Windows-machine, you might have formatted the HDD in HFS+, which is the standard file system for Macs (such as NTFS / FAT32 for Windows) and HFS+ isn't readable in Windows.\n\nYou could check in Disk Utility on your MacBook Pro how the HDD is formatted.\n",
 "I decided to come here to look for some troubleshooting advice (not that we need much on our Macs!) and to ask a few questions about short cuts etc.\nI'm 40 something and somewhat new to Apple products (used them for approx. 2 years).\n",
 'Welcome to all the new folks!\n',
 'Hello All,\n\nJust signed up. Have been a PC user for more than 2 decades. Want to switch over and have 2 Macbook Pros on order. Hopefully, I will find all my answers here. Thank you for a wonderful community (from what I have read).\n',
 'Welcome and enjoy the forums ... very helpful and knowledgeable people around here.\n\nCheers ... McBie\n',
 'Welcome all yé newcomers ! The first rule of Fight clu.... oh wait, wrong place, sorry. In any case, just a tip for you guys. Most general questions have most likely been asked already. If you search and find a thread based upon the topic you were curious about, chances are that you\'ll have an easier time getting an basic understanding of the issue, as well as a faster reply. And of course, you\'ll have a lot less responses such as: "please search the forums, this topic has been beaten to death" etc.. \n\nAnd if you can\'t find what you\'re looking for, then PM schweb at least 30 times a day about it, he LOVES that. \n\nMost of all, have fun! \n\nDoug\n',
 "Hi everyone! My name is Exequiel. I'm 19 years old.\n\nMy main interests are video editing, 3D modeling, photo retouch, iOS app development. \n\nI own an iMac (Later 2009) and an iPad 1G\n\nI'm open to help anyone in anything possible.\n",
 'Greetings everybody.\nNice to be here.\n\nHope to learn from your community.\n\nBest regards \nGeorge\n',
 'Welcome aboard everyone!! \n',
 'Just joined up today.\nBought a MacBook Pro last week.\n\nI love playing piano/keyboards. But I have no computer skills.\nI am looking for any advice with regards to Music Software such as Cubase, Logic, etc, etc.\n\nHopefully I will learn heaps from this forum.\n\nRegards, Justin.\n',
 "Don't make me cut you.\n",
 'Hi, I recently bought a MacBook Air second hand from a friend and I love it! I have a few questions about issues with owning a second hand mac (like mobileme accounts) where in the forum should I ask these questions?\n',
 "Hi am Dom, 17 year old from the UK and shall be owning a Mac book pro very very soon and shall be changing from windows to mac. i look forward to everyone's help \n",
 'This is relevant.\n\nTake a look at the names and descriptions of the various forums. Try your best to put it in the most appropriate place (some questions can go in multiple places so it can be tricky sometimes).\n',
 'I will apparently be the elder on this list... From the USA, NYC exactly. But I travel and teach. And I am trying to connect to as many people as I can so I am exploring social networking, programs for audio and video recording, webinars, email campaigning, photoshop etc. Its a lot of learning and having a place brimming with experts is wonderful for those tricky moments or when i will need a mentor.\n\nI was gifted a great computer with some amazing programs. In a year, I have run out of space on my hard drive because my mac has become my home in many ways.\n\nI hope to meet many of you on my journey since I have a lot to learn. I will begin posting immediately and update my avatar etc so you can see my stable of mac products. I am what would be called moderate :-) compared to those friends of mine who live mac (a few programmers I call my best friends).\n\nKalaki\n',
 "G'day\n\nmacgrunt : publishing industry production grunt : interested in workflow automation and optimisation : particularly applescript\n\notherwise a neo-luddite :-)+)\n\nbrisbane : queensland : australia\n\n\nm.\n",
 "hi everyone.. \n\nI'm James and new to this forum.. I'm a certified apple fanatic.. that's all I can say..  \n",
 "Hi. My nickname's Raquinator. I'm a rising 2nd year college student, who just recently has been doing some work on my MacBook. I'm looking into Chemical Engineering as a major and hoping to eventually become a doctor. I made the switch to Mac about a year and a half ago, and i'm loving it. Better than Windows...though sadly, I need Windows for school programs. My main reason for joining was to get some help, setting up a tri-boot. \n",
 "Suddernly, my Imac will not launch safari, but the laptop on the network will, so I don't know what the problem is. Can anyone help? I am relatively new to mac.\n",
 "Hi I'm ige. 15 and from the philippines. Enjoying my Macbook Pro bought the other night, and yeah I named her already <3\n",
 "Welcome, you might want to start a new thread as that's not the topic of this one.\n",
 'Hi, My name is Barry.\n\nIm new to the Mac world and Im looking to get the maximum experience and hope to learn and share as much as possible with you guys.\n',
 "Hi everyone, my name is David Eccheli\n\nI am italian and I started thinking about buying a mac in future because I have an ipad now and I really enjoy it. I know italian, german, english and a little bit of French and Latin. I grew up on windows based PCs and liked it, I mean they're not bad but I heard so many good things about macs so I'll probabily switch to mac. I got some experience in computer sectors basically in Hardware.\nCiao a tutti ^^!\n",
 'Welcome all. Glad to have you here at Mac-Forums.\n',
 'Welcome to the new guys. As you have already discovered this is a very friendly place. \n\n@David Judging from your post your English is far superior to the few words of Italian I think I remember.\n',
 'Hello everybody! Mike here...how r u doing?\n',
 'hai there.\nim jezebel im 20 and thats about it.\n',
 "Hey all,\nMy name's Aaron. I'm 23 living in Massachusetts and I'm still using my PowerBook G4. Honestly, though, this seems to be the most functional laptop Apple put out. I'd be lost without a 6-pin USB port. \nI'm excited to join this forum because I'm not particularly tech-savvy and I do run into problems that sometimes feel impossible.\n",
 "Hi! My name's Elaine, and I'm new here (as a registered user anyway). I'm a grad student. I've just gotten a used MacBook mid-2007, and after a hard drive died, I upgraded it a lot - maxed out the memory, and, since it was used, I had to buy all new software. So I'm using Snow Leopard (since I could just buy the $30 upgrade version). I don't need a lot of software on my computer, but I did buy the 2011 Office for Mac (needed for school & work among other things) and I use Firefox. I'm a do-it-yourselfer, so my knowledge about most things reflects that: I know what I've needed to learn to do something. That means I have a lot of holes in my knowledge, so I figured this is a great place to patch them up when they come to my attention!\n\nOn that note, off I go to ask a question... \n",
 'Nice to see so many new faces checking in. Welcome all! Glad to have you on board.\n',
 "Hello there!\n\nI'm Loraine, 22, just graduated from college and making plans to move to Amsterdam. I'm vegan and have the cutest toy poodle in the world. \n\nNeeding a Macbook Pro but wanting a Macbook Air, decisions, decisions, decisions. Current owner of a Toshiba and iPhone 4.\n\nAnd I love sharks! \n",
 'Been here for a few months. Returning to Apple after a 30 year hiatus. My last Apple computer was a II+ way back when memory was measured in bytes. the wife and I have MBPs. I have tons of music and have just about figured out how to "do" everything I want to do with it. I think I need to keep a PC only for MS flight simulator and a few first person shooters. I\'m finding that my PC is off most of the time, now. Thanks to all for the good info posted here. John in Hillbilly Orlando, Tennessee.\n',
 "Hi, My name is Kat and I've lurked on these forums from time to time, mostly when I've needed to find an answer to some kind of Mac issue. I'm looking for an answer that I can't seem to find an answer to and thought it was about time I registered! \n\nI work for a company that doesn't have any Mac support at present, so until this is sorted we are muddling through. I don't have any network experience so it's all fun and games trying to sort out issues that arise. I work in graphic design, presently as an in-house artworker with iMacs. I also have Macs at home and have been working with them on and off in my industry for about 6 years.\n",
 'Hi all. Just got a new 27" Imac. My first Mac. I would like to upgrade the Ram. Where should I post my questions? Thanks!\nBill\n',
 "A hearty welcome to all of you! Have fun and a great learning experience. And as we say in Texas, you'all come back.  \n",
 "My name is Ron and I'm from the great state of Washington, USA. Last weekend I purchased this big bad 27 inch I-MAC desktop computer, my first Mac, and I'm already hooked. I was so fed up with windows and it's many negative issues, especially it's tendency to crash on a regular basis. I only wish I had switched to MAC earlier. Now I look forward to seeing what this baby can really do. Cheers!!\n",
 "Hi!\n\nMy name is Riccardo. I am originally from Italy but I've been living in the states for a couple of years now, specifically Long Island. I've been a long time Linux user and I recently switched to Mac. I can't say that I love it, but I don't hate it as much as I used to  Unfortunately, my Mac journey will end soon (for now!): my new employer is offering me a free pc, and I shouldn't pass up the opportunity (and no, I can't sell it and keep the mac). So here I am, trying to figure out the best way to sell an almost brand new Macbook pro while avoiding eBay fees!  I found out that there is a restriction for new members wrt to access to the marketplace section, and I am wondering how long will it last... Also, if somebody needs help switching to Linux, I guess I can help ;-)\n",
 "Welcome!\n\nUnfortunately, the Buy/Sell/Trade trade forum is only open to well-established members. We don't specify a timeframe or post count (it's a combination of multiple factors) because we don't want to give people a goal to work toward, as that would defeat the purpose of the rule.\n\nI would recommend Craigslist to sell your MacBook. Good luck!\n",
 'Hello, I\'m Trevor. I recently received my 13" Macbook Air, and it\'s simple and intuitive. I\'m having a lot of fun with it, and have a few questions. So i\'m headed over to the search tool to see if there\'s a few answers, otherwise it\'s quiz tim for all of you! \n',
 "Hello everyone, my name is Jorrit and I'm from the Netherlands. I have a Mac Mini. I registered to this forum because I would like to get some help with software difficulties.\n",
 'Hello,\n\nMy name is Beth and I am on my second Mac. Currently, I have a MacBook Pro. I had a Mac book prior. I am on my second iphone and am looking forward to iphone 5. (iphone4 for sale soon)! I am a therapist so no educational background on electronics and systems. But would like to take some classes. I know enough to get me in trouble so hopefully you guys/gals will be able to bail me out when I wreck something! I am your average technophile and like the new stuff. I like the idea of using the iphone for different things other then talking on it and checking email and facebook. I purchased a remote system THE BEACON so I can use my iphone to control my components. That is pretty cool. I have a remote app for the mac to control the DVD player so the only time I have to get off the couch is to plug it in. I will post a picture. \n\n\nBeth\n',
 "Hello, I'm Joanna.. I've had my MacBook since 2007, and I love it. \n\nI love videos and exotic animals (I have 2 skunks :3). I just recently quit playing world of warcraft after 3 years .\n\nI like to think I know a little about computers, but I probably don't know much at all, that is why I am here, to try to get help from people who actually DO know about computers! \n",
 "OK so like most new people i'm a long time windows user and just got tired of the crashes and just the garbage that kept coming with windows in general. I have several friends, coworkers and family that are diehard into apple/mac and recommened that I give it a try. I was given a 3G 16gb iPhone for free from a buddy who had upgraded and I played with it a little just downloading random things and not doing much of anything. I did not activate it bc I am with VZW (dirty language in here prolly lol sorry)(prior to release of iPhone on VZW)(have a Droid X) and I couldnt cancel my contract bc of out of control fees! So I used it as an iPod touch for the most part. My dad upgraded from iPad to iPad2 so I purchased the iPad off of him and again really didnt use it for much but on the go internet and random app downloads. Recently I started using my iPad for work (Career Firefighter/EMT) with all different kinds of things, meeting notes, inspections and whatever else I could really think to use it for. Then I went to the Apple Store on day and I felt as I was in heaven! The CS there was amazing an the CS Rep spent 2 hrs with me just talking and showing me different things with different products. I went back the next day to the same CS rep and purchased a MacBook Pro. So I have had my MacBook for less than a week and I love it for everything I have done with it so far and that has not been much! Again i'm not really sure what to do... lol, I have downloaded so random things and what not but i'm looking for a good start by hearing from you guys/gals! I have downloaded the trial verison of iWorks and I have iLife and have played around with it a little. So yeah basically i'm a total newbie to mac and would love some help and new friends to chat with about this wonderful PC... oops i mean Mac!!!!  So please feel free to hit me up with ideas and what not!!!\n\nThanks!!!!\n\nJeremy\n",
 "Hello from CY!\n\nI just found about you thru search engines but still have a lot of questions about my new computer!\n\nMacbook pro quad core i7 (DON'T REALLY KNOW IF ITS IDENTICAL ON WHAT SHOWS BEYOND!)\n\nHardware Overview:\n\nModel Name:\tMacBook Pro\nModel Identifier:\tMacBookPro8,2\nProcessor Name:\tIntel Core i7\nProcessor Speed:\t2.2 GHz\nNumber of Processors:\t1\nTotal Number of Cores:\t4\nL2 Cache (per Core):\t256 KB\nL3 Cache:\t6 MB\nMemory:\t4 GB\n\n\nI need to ask some questions about my new product so if there is somebody who can help me out just please send me a pm!\n",
 'Is "CY" a place. If so, where is "CY"?\n\nWe don\'t really operate this way. If you have questions...try searching the site first for answers. If you cannot find an answer...then post questions on the forums so everyone can take a shot at answering.\n\nWelcome to Mac-Forums,\n\n- Nick\n',
 "We have been Apple fanatics since the first Apple (the two Steve's - not Eden!). Had one computer for 11 weeks; decided husband was too much of a keyboard hog and bought my own. Have been Mac Consultants for many years. Currently have MacBook Pros with desk displays, etc for in office use. Use iPads, iPhone and Verizon wi-fi card.\n",
 'Hello everyone. My name is Maria R. I am 46 from Guatemala, although I have 1 ½ years since I changed to a MacBook which I learned to use by myself and been very satisfying experience. I am really trying to discover all the apps and cool things about Mac, but it has taken me a lot of time, but hoping to find someone who can help me on how to learn more and where. Any help will be appreciated. And I wish that some day I could help someone in my shoes.\nWhat I love about my Mac is that I can surf the web without worrying me about so much viruses, and I surf a lot.\nHave a a great day.\n',
 'I just purchased a MacbookPro 13". It\'s the first Mac I own but I\'ve used them for the last 15 years while working in media development. I\'ve owned just about every computer you can think of and am not loyal to any one type of computer.\n\nI currently have an MbPRO, Acer Netbook, HTC Evo, often use Ubuntu at home and Solaris at work. \n\nI love all computers and happy to finally own a mac.\n',
 "Hi my nickname is plaiter.\n\nI have been a PC user for 20 odd years and today I finally bought my first iMac.\n\nI have been looking at Mac's for some time as I have ipod, iphone, ipdad & Apple tv.\nBut was put off by talk of compatibility issues and the different interface.\n\nSo today I took the plunge and feel like my first day at school.\n\nI appreciate a lot of my questions will have been asked many times so will check the forums before posting.\n\nIt will be nice to talk to people going through the same experience.\n\n",
 "I have actually owned our iMac for a few years now, but have only gotten to know Macs and OSX recently. I like to say i'm smart enough to know how to \ndo most things on a Mac, and have begun to work with terminal. Everyone I\nknow thinks Macs are nice looking, but being a Windows fan for decades, I think Microsoft has grown stale IMO so I have embraced Macs and have the\nipad and a macbook air on my radar. Anyway, take care and chat w/you all\nlater.\n\n\n\nViper\n",
 'I decided to purchase a MacBook Pro after being a Windows user for a long time. I have done a lot of research and felt that this was the right decision. I am really enjoying my new MacBook Pro and am looking forward to learning more about my computer. \n\nLittle bit about me: I am in my last semester in college and looking forward to applying to Graduate school. \n\nThanks for reading!\n',
 "Hey Everyone! My name is Angie I joined Mac-Forums primarily bc I am having a problem with my MacBook Pro.\nI have been using Apple products for about 4 years, I am rally happy with their products, I have NO technical background, I learn as I go and there's probably a LOT I stillneed to learn about Apple computers. I hope to learn from all of you.\n\n",
 'I joined the Mac forums in 2009 according to the info ! I only pop in here when I have an issue, which is infrequently. \nKarl, from England, I have the supercilious accent and everything.\n\nI\'ve been using a Mac since 1984, when the world was all monochrome and my phone had a wire that went into a wall.\n\nSince Apple Mk2 (the return of Steve), I\'ve had 2 x iMacs, an eMac (still got this for no reason than it\'s too big to lose), a G3, G4 (still got this as a footstool) and a G5. \nRight now I have a MacBookPro 15", a MacBookPro 13", an iPad2, an iPhone, an iPod touch and an iPod Nano. I had Apple TV (mk2) but sold it as I\'m not a heavy TV user and, here in the UK we have the BBC, which pretty much covers everything you need (no adverts & top quality). \n\nEvery 18 months I beta test one Windows software title, so, on Monday I\'m going to load Parallels and Win7 on the 15" book. \n\nHope to give & receive help is equal measure.\n\nThanks for reading \n\nkind regards, Karl.\n',
 'Cyprus ............. that big rock just to the left of Israel at the far end of the Med .......... \n',
 "I am Sunny Taylor. I'm a 62 year old cowgirl, who lives in northern Wyoming. My first computer was a Tandy Color Computer, with a tape drive. I went through all the Apple computers, the last one being an Apple II GS. Then, because I wanted some of the modern programs, I bought a PC. I had four PCs and decided that I had had enough of the Windows programs, so I bought my first MacBook Pro. I am now using a second MacBook Pro. It has been a learning experience and I'm still having some issues with Numbers and Pages. My mind is having a hard time converting from Excel and Word. Trying to do some greeting cards, but fighting the margins, etc. Anyway, I'm joining this forum to learn a little more. I'm considering buying a new printer that will work wirelessly, so any suggestions would be welcome.\n",
 "Hello all, after many many years of procrastination. I have finally converted. I feel like I've changed religions, lol.\n",
 'Hi,\n\nMy name is Abeda Sultana and I work at a school solely based on apple computers, servers and devices.\n\nHave always come back to this forum for solutions and would now like to post a few for which I have not found any.\n\nBest regards to everyone.\n',
 'Welcome everyone to Mac Forums. We all hope you enjoy your stay!\n',
 "I'm considering moving to a mac mini. I'm proficient with and reliant on the\n*nix command line. Tools on linux that I would have a hard time doing without\non mac would be mutt, vim, python (and vim with python) and midnight commander.\nI live in Alaska.\n",
 "Hi All...as with Dan, I'm eagerly anticipating the same iMac this week. It's my first go around with a Mac as I've been a lifelong PC user but felt it was time to take the plunge.\n\nI've been lurking in these forums for a while now but wanted to officially join so I can take part in the conversations and hopefully learn alot from this Mac community.\n\nCheers,\nGavin.\n",
 'Welcome to the Mac Forums.\n\nRead through our forum Stickies and take advantage of our search engine for any information and questions that you may have. And of course feel free to post and ask questions to the membership.\n',
 'Welcome aboard all new users. If we can help you in any way, just ask.\n',
 "Hey guy's my name's Bruno Lima, I honestly am technology uneducated lol and I just came here to sell a few extra things of mine that I frankly don't use and could surely use the money!\n",
 'Welcome Bruno. Unfortunately, our Buy/Sell/Trade forum is not open to new members, but feel free to get engaged in other discussions.\n',
 'How long do I have to be a member to have access to it?\n',
 "We don't specify. The intent of our Buy/Sell/Trade forum is to allow our regular, established members a place to trade goods. \n\nThe requirements are a mix of post count, length of time as a member and reputation level.\n",
 "Hi, my name is Alex. I have just came to possess my first macbook (circa 2008 C2D). Since I've never used a mac before, I'll probably need all advice I can take.  \n",
 "Hello everyone. i am new to this lounge and indeed to this forum. i come here with cap in hand, hopefully to dip into the technical knowledge which i hope resides here.\n\nI have been an apple lover for a little over 10 years having bought my first ipod in 2000. Since then i have continued my love of the brand and have collected a few more apple items along the way. Apple TV2 being the last, a few weeks ago. \n\nAs i start to increase my apple purchases i am now getting increasingly interested and then increasingly confused at the technical principles, qualities, configurations and performance capabilities that come hand in hand with apple rather than the age old industrial design qualities that set apple far above the rest. (the main reason i fell in love with Apple) \n\nAnd here is the problem - i know very little about the IT world and i hope that i will be able to pick a little from each and everyone of you. \n\nWhat I can offer in return, other than a stack full of thanks, is my experience, mainly problems, that i have had in setting up my extremes, extending WIFI, ATV2's, netflicks, lions, networks, streaming (whatever that means), Itunes, servers and everything else i have bought - blindly... if you want to talk about Architecture, buildings, photos, design and UK football, then i can field these also free of charge via the pm button.\n\nThanks in advance for your guidance and help\n",
 "I've been browsing through some of the threads. looks like a helpful place. So here i am. My name is Carl 23yrs old. I've just acquired an old Macbook '07 i think. hopefully you guys can help me. And i would love to help others also, as i learn about these macs!!! lol feel free to message me.\n\nThanks everyone!\n",
 'Hi everyone here I am programmer.\n\nAnd I love mac.Using iOS on iphone or ipad is funnier than other OS because of smoothness and fast !\n\n\n',
 "Hello all. My name is London and I live in Georgia. I just joined yesterday and are trying to learn my way around the site. I'm 59 years young and am looking forward to getting and giving some helpful advice. Hope everyone has a good day!\n",
 'Welcome to Mac Forums KingPappy!\n',
 "What's up guys! Just wanted to introduce myself. My name is _stackz and I recently bought my first ever Apple laptop. Thought I'd join a site like this one with helpful information since I am an Macbook pro newbie.\n"]

In [8]:
# Look-up table from a post's file path to its text.
# Assumes documents[i] holds the content of files[i] — TODO confirm against the loading cell.
d = {path: text for path, text in zip(files, documents)}

In [9]:
# First configuration: the extractor receives raw strings ('content') and drops
# English stop words; no max_features argument is passed here.
extractor = CountVectorizerWrapper(
    stop_words='english',
    input='content',
)
# The detector is built around that extractor.
detector = OOTDetector(extractor=extractor)

In [10]:
# Compute one distance per post using the 'correlation' metric. The ranking cell
# further down sorts these in descending order, i.e. a larger distance is treated
# as "more likely out of thread".
# NOTE(review): what each post is compared against is decided inside
# OOTDetector.txt_comp_dist — confirm in otdet.
distances = detector.txt_comp_dist(documents, metric='correlation')

In [11]:
# Echo the raw distance array; it is later paired positionally with `files`.
distances


Out[11]:
array([ 0.60378334,  0.78561429,  0.57900064,  0.92560692,  0.72125659,
        0.77102393,  0.82561807,  0.74302117,  0.98296684,  0.8081718 ,
        0.84634129,  0.75255459,  0.91056711,  0.67700846,  0.53698482,
        0.87447599,  0.65879189,  0.92087251,  0.82097788,  0.78499152,
        0.85171199,  0.90822814,  0.93019485,  0.92861478,  0.94895027,
        0.89480038,  0.81380972,  0.95681516,  0.69071022,  0.64192176,
        0.95471945,  0.79968589,  1.05196933,  0.77376232,  0.77204975,
        0.69080278,  0.86216311,  0.82352771,  0.69052674,  0.75526087,
        0.59604415,  0.88952196,  0.96346872,  1.0179439 ,  0.97978625,
        0.6814476 ,  0.86773857,  0.89005048,  0.87308202,  0.73143717,
        0.57216096,  0.92207389,  0.72579407,  0.60132503,  0.87600861,
        0.9014299 ,  0.70758033,  0.68308336,  0.75735668,  0.67024684,
        0.79956405,  0.75888282,  0.75954185,  0.66011284,  0.65715068,
        0.71477525,  0.78714419,  0.75049878,  0.69858035,  0.82171042,
        0.96505952,  0.75071523,  0.86460663,  0.84466178,  0.64473132,
        0.81847464,  0.65775171,  0.74294163,  0.73067502,  0.89921753,
        0.84205773,  0.99848671,  0.93960027,  0.67484527,  0.86843199,
        0.73497959,  0.73689223,  0.83655291,  0.60010537,  0.7829984 ])

In [12]:
# Pair every distance with its file path and order the pairs from the most
# distant post to the least distant one (equal distances fall back to comparing
# the file paths, because whole tuples are compared).
ranked = list(zip(distances, files))
ranked.sort(reverse=True)

In [13]:
# Show the complete ranking, largest distance first.
ranked


Out[13]:
[(1.0519693329157791,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-32.txt'),
 (1.0179438981225255,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-43.txt'),
 (0.99848671347549212,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-81.oot.txt'),
 (0.98296684221879227,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-8.txt'),
 (0.97978624533008807,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-44.txt'),
 (0.96505951869716022,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-70.oot.txt'),
 (0.96346871948806834,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-42.txt'),
 (0.95681516088207474,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-27.oot.txt'),
 (0.95471944651394025,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-30.oot.txt'),
 (0.94895027173330848,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-24.txt'),
 (0.93960026711634892,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-82.oot.txt'),
 (0.93019484831049537,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-22.txt'),
 (0.92861477506222934,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-23.txt'),
 (0.92560691638860837,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-3.txt'),
 (0.92207389095148529,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-51.txt'),
 (0.92087251353829136,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-17.oot.txt'),
 (0.91056711166450022,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-12.txt'),
 (0.90822813543552816,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-21.txt'),
 (0.90142990038762016,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-55.txt'),
 (0.89921753205009169,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-79.txt'),
 (0.89480038150274455,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-25.txt'),
 (0.8900504828162501,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-47.txt'),
 (0.88952196185295285,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-41.txt'),
 (0.87600861256579865,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-54.oot.txt'),
 (0.87447599045583735,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-15.oot.txt'),
 (0.87308201875788427,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-48.txt'),
 (0.86843199456853948,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-84.txt'),
 (0.86773856749862477,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-46.txt'),
 (0.86460663088795964,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-72.txt'),
 (0.86216311284092306,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-36.txt'),
 (0.85171198791373737,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-20.txt'),
 (0.84634129261992053,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-10.txt'),
 (0.84466178231270228,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-73.txt'),
 (0.84205773447567589,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-80.oot.txt'),
 (0.83655291489789452,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-87.txt'),
 (0.82561806872120758,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-6.oot.txt'),
 (0.82352770623695948,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-37.oot.txt'),
 (0.82171041945501466,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-69.txt'),
 (0.82097788499037849,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-18.oot.txt'),
 (0.81847463961700817,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-75.txt'),
 (0.81380972346221414,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-26.txt'),
 (0.80817180172101488,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-9.txt'),
 (0.79968588716622202,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-31.txt'),
 (0.79956405121146767,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-60.oot.txt'),
 (0.7871441925223619,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-66.txt'),
 (0.78561428535608235,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-1.txt'),
 (0.78499151561435032,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-19.txt'),
 (0.78299840027657219,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-89.txt'),
 (0.77376232257212307,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-33.txt'),
 (0.77204975255805197,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-34.txt'),
 (0.77102393462590868,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-5.txt'),
 (0.7595418482539581,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-62.txt'),
 (0.75888282490799774,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-61.oot.txt'),
 (0.75735668456552829,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-58.txt'),
 (0.75526086714654728,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-39.txt'),
 (0.75255458801528829,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-11.txt'),
 (0.75071522903379284,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-71.txt'),
 (0.75049878303799789,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-67.txt'),
 (0.74302117162719539,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-7.oot.txt'),
 (0.74294162830089094,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-77.txt'),
 (0.73689222575788738,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-86.txt'),
 (0.73497959050992923,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-85.txt'),
 (0.73143716674470105,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-49.txt'),
 (0.73067501690785153,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-78.txt'),
 (0.7257940661309723,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-52.txt'),
 (0.72125659422975241,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-4.oot.txt'),
 (0.7147752471338904,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-65.txt'),
 (0.70758032662818693,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-56.txt'),
 (0.69858034962081783,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-68.txt'),
 (0.69080278416302054,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-35.oot.txt'),
 (0.69071021561721102,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-28.oot.txt'),
 (0.69052674334358843,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-38.txt'),
 (0.6830833635620801,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-57.txt'),
 (0.68144760106792435,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-45.txt'),
 (0.67700846063356357,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-13.txt'),
 (0.67484526856330351,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-83.txt'),
 (0.67024684106819876,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-59.oot.txt'),
 (0.66011284356377331,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-63.txt'),
 (0.65879189079888989,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-16.oot.txt'),
 (0.65775171481701844,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-76.txt'),
 (0.65715068147571976,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-64.txt'),
 (0.64473131842910147,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-74.txt'),
 (0.64192175950330987,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-29.txt'),
 (0.60378334422682323,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-0.txt'),
 (0.60132502971451296,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-53.txt'),
 (0.60010536885301735,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-88.txt'),
 (0.59604414821650631,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-40.txt'),
 (0.57900064259448469,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-2.txt'),
 (0.57216096308646758,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-50.oot.txt'),
 (0.53698481730876546,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-14.oot.txt')]

In [14]:
# Number of entries in the vocabulary held by the first extractor; the next
# experiment caps max_features at roughly 10% of this figure.
len(extractor.vocabulary_)


Out[14]:
1110

In [15]:
# Second configuration: same settings as the first extractor, but keep only
# about 10% of the features.
# The cap is derived from the vocabulary of the first extractor (1110 entries in
# the recorded run, which gives 111) instead of being typed in by hand, so it
# stays at 10% if the dataset changes. This cell must therefore run after the
# first distance computation, which is when `extractor.vocabulary_` is available.
# max(1, ...) keeps the cap positive for very small vocabularies.
# NOTE(review): assumes CountVectorizerWrapper interprets max_features the way
# scikit-learn's CountVectorizer does — confirm in otdet.
extractor2 = CountVectorizerWrapper(
    input='content',
    stop_words='english',
    max_features=max(1, len(extractor.vocabulary_) // 10),
)
detector2 = OOTDetector(extractor=extractor2)

In [16]:
# Same scoring as the first experiment, but with the feature-capped detector and
# the 'euclidean' metric; again one distance per post.
distances2 = detector2.txt_comp_dist(documents, metric='euclidean')

In [17]:
# Echo the raw distances of the second experiment.
distances2


Out[17]:
array([ 134.89625643,  133.58517882,  131.36590121,  136.56134153,
        133.76471882,  136.20939762,  134.80726983,  134.97036712,
        136.64918587,  135.79764357,  134.04849869,  132.48773528,
        136.60527076,  128.90694318,  129.66495286,  135.05924626,
        132.48773528,  135.26640381,  135.04443713,  136.26811806,
        135.5027675 ,  136.37081799,  134.74791279,  135.84181978,
        136.47344064,  136.73697379,  134.48048186,  136.79546776,
        133.51029923,  133.37541003,  136.2974688 ,  131.97348218,
        136.91238074,  135.60604706,  133.25539389,  134.65882815,
        135.91541487,  136.07718398,  134.25721582,  133.06013678,
        135.44371525,  135.76818479,  136.78084661,  136.94159339,
        136.1800279 ,  129.61867149,  136.19471355,  134.97036712,
        134.46560899,  131.8673576 ,  133.85439851,  136.40014663,
        132.93983602,  128.23805987,  134.77759458,  135.79764357,
        134.98518437,  129.38701635,  133.24038427,  123.24366109,
        132.00378782,  134.1081653 ,  133.67497896,  131.33544838,
        132.03408651,  132.2157328 ,  133.04510513,  133.31541546,
        132.83448347,  131.59407281,  136.59062925,  131.21356637,
        135.90069904,  135.31075345,  135.31075345,  134.59940565,
        132.81942629,  134.40610105,  135.42894816,  135.62079487,
        135.57654664,  136.95619738,  135.85654198,  134.22741896,
        130.83195328,  133.25539389,  135.04443713,  134.97036712,
        135.44371525,  134.95554824])

In [18]:
# Ranking for the second experiment: (distance, file path) pairs ordered from
# the largest distance down; ties on distance are broken by the file path.
ranked2 = list(zip(distances2, files))
ranked2.sort(reverse=True)

In [19]:
# Show the complete ranking of the second experiment, largest distance first.
ranked2


Out[19]:
[(136.95619737711763,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-81.oot.txt'),
 (136.94159338929865,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-43.txt'),
 (136.91238074038446,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-32.txt'),
 (136.79546776117988,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-27.oot.txt'),
 (136.78084661238211,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-42.txt'),
 (136.7369737854396,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-25.txt'),
 (136.64918587390119,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-8.txt'),
 (136.60527076214885,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-12.txt'),
 (136.59062925398652,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-70.oot.txt'),
 (136.56134152826706,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-3.txt'),
 (136.4734406395618,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-24.txt'),
 (136.40014662748717,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-51.txt'),
 (136.37081799270692,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-21.txt'),
 (136.2974687952788,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-30.oot.txt'),
 (136.26811806141598,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-19.txt'),
 (136.20939761998804,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-5.txt'),
 (136.194713553794,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-46.txt'),
 (136.18002790424151,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-44.txt'),
 (136.07718398026907,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-37.oot.txt'),
 (135.91541487263319,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-36.txt'),
 (135.90069904161641,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-72.txt'),
 (135.85654198455074,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-82.oot.txt'),
 (135.84181977579658,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-23.txt'),
 (135.79764357307531,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-9.txt'),
 (135.79764357307531,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-55.txt'),
 (135.76818478568535,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-41.txt'),
 (135.62079486568422,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-79.txt'),
 (135.60604706280617,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-33.txt'),
 (135.57654664432192,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-80.oot.txt'),
 (135.5027674994131,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-20.txt'),
 (135.44371524733069,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-88.txt'),
 (135.44371524733069,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-40.txt'),
 (135.42894816101909,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-78.txt'),
 (135.31075345293144,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-74.txt'),
 (135.31075345293144,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-73.txt'),
 (135.26640381114595,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-17.oot.txt'),
 (135.05924625881784,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-15.oot.txt'),
 (135.04443713089407,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-86.txt'),
 (135.04443713089407,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-18.oot.txt'),
 (134.98518437221176,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-56.txt'),
 (134.97036711811967,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-87.txt'),
 (134.97036711811967,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-7.oot.txt'),
 (134.97036711811967,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-47.txt'),
 (134.95554823718808,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-89.txt'),
 (134.89625643434292,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-0.txt'),
 (134.80726983364065,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-6.oot.txt'),
 (134.77759457714032,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-54.oot.txt'),
 (134.74791278531924,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-22.txt'),
 (134.65882815471105,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-35.oot.txt'),
 (134.59940564504734,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-75.txt'),
 (134.48048185517482,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-26.txt'),
 (134.46560898608982,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-48.txt'),
 (134.40610105199838,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-77.txt'),
 (134.25721582097552,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-38.txt'),
 (134.22741895752895,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-83.txt'),
 (134.10816529950739,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-61.oot.txt'),
 (134.04849868610987,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-10.txt'),
 (133.85439850822982,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-50.oot.txt'),
 (133.76471881628578,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-4.oot.txt'),
 (133.67497896016292,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-62.txt'),
 (133.58517881860996,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-1.txt'),
 (133.51029922818688,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-28.oot.txt'),
 (133.37541002748594,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-29.txt'),
 (133.31541546272885,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-67.txt'),
 (133.25539388707685,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-85.txt'),
 (133.25539388707685,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-34.txt'),
 (133.24038426843418,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-58.txt'),
 (133.06013678032951,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-39.txt'),
 (133.04510513355987,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-66.txt'),
 (132.93983601614678,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-52.txt'),
 (132.83448347473634,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-68.txt'),
 (132.81942628998215,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-76.txt'),
 (132.48773528142144,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-16.oot.txt'),
 (132.48773528142144,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-11.txt'),
 (132.2157328006013,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-65.txt'),
 (132.03408650799233,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-64.txt'),
 (132.00378782444085,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-60.oot.txt'),
 (131.97348218486925,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-31.txt'),
 (131.86735759845953,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-49.txt'),
 (131.59407281484982,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-69.txt'),
 (131.36590120727678,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-2.txt'),
 (131.33544837552427,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-63.txt'),
 (131.21356637177422,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-71.txt'),
 (130.83195328359201,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-84.txt'),
 (129.66495285928269,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-14.oot.txt'),
 (129.61867149450345,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-45.txt'),
 (129.38701635017324,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-57.txt'),
 (128.90694317995442,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-13.txt'),
 (128.23805987303459,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-53.txt'),
 (123.24366109459748,
  'datasets/macforums/245842__New-to-macforums-introduce-yourself/post-59.oot.txt')]

Sekarang kita lihat precision dan recall untuk percobaan pertama (semua fitur dipakai, correlation distance).


In [20]:
import numpy as np
import pandas as pd

In [21]:
# Precision and recall of the first ranking (`ranked`) at several cut-offs.
# A post is labelled out-of-thread (OOT) when its file name ends in '.oot.txt';
# the leading dot keeps unrelated names such as 'foot.txt' from matching.
thresholds = [1, 3, 5, 22]

# The set of all OOT posts does not depend on the cut-off, so build it once.
all_oot = [filename for _, filename in ranked if filename.endswith('.oot.txt')]

# Row 0 holds precision, row 1 holds recall; one column per cut-off, so the
# array shape always follows the threshold list.
data = np.zeros((2, len(thresholds)))
for i, t in enumerate(thresholds):
    # OOT posts found among the t highest-ranked posts.
    top_oot = [filename for _, filename in ranked[:t] if filename.endswith('.oot.txt')]
    precision = len(top_oot) / t
    # Recall is undefined when the thread has no OOT posts; report NaN instead
    # of raising ZeroDivisionError.
    recall = len(top_oot) / len(all_oot) if all_oot else np.nan
    data[:, i] = [precision, recall]

In [22]:
# Label the metric rows and the cut-off columns of the first experiment's results.
res = pd.DataFrame(
    data,
    columns=['top 1', 'top 3', 'top 5', 'top 22'],
    index=['precision', 'recall'],
)

In [23]:
# Display the precision/recall table of the first experiment.
res


Out[23]:
top 1 top 3 top 5 top 22
precision 0 0.333333 0.200000 0.272727
recall 0 0.045455 0.045455 0.272727

Sekarang kita lihat precision dan recall untuk percobaan kedua (hanya 10% fitur dipakai, yaitu 111 dari 1110, euclidean distance).


In [24]:
# Precision and recall of the second ranking (`ranked2`) at several cut-offs.
# A post is labelled out-of-thread (OOT) when its file name ends in '.oot.txt';
# the leading dot keeps unrelated names such as 'foot.txt' from matching.
# NOTE(review): the first experiment also evaluates a cut-off of 22; it is not
# added here because the results table below names exactly three columns.
thresholds2 = [1, 3, 5]

# The set of all OOT posts does not depend on the cut-off, so build it once.
all_oot = [filename for _, filename in ranked2 if filename.endswith('.oot.txt')]

# Row 0 holds precision, row 1 holds recall; one column per cut-off, so the
# array shape always follows the threshold list.
data2 = np.zeros((2, len(thresholds2)))
for i, t in enumerate(thresholds2):
    # OOT posts found among the t highest-ranked posts.
    top_oot = [filename for _, filename in ranked2[:t] if filename.endswith('.oot.txt')]
    precision = len(top_oot) / t
    # Recall is undefined when the thread has no OOT posts; report NaN instead
    # of raising ZeroDivisionError.
    recall = len(top_oot) / len(all_oot) if all_oot else np.nan
    data2[:, i] = [precision, recall]

In [25]:
# Label the metric rows and the cut-off columns of the second experiment's results.
res2 = pd.DataFrame(
    data2,
    columns=['top 1', 'top 3', 'top 5'],
    index=['precision', 'recall'],
)

In [26]:
# Display the precision/recall table of the second experiment.
res2


Out[26]:
top 1 top 3 top 5
precision 1.000000 0.333333 0.400000
recall 0.045455 0.045455 0.090909

In [26]: