2013-08-08

TSQL - Function to get string returned as a table with one record for each word, and some basic stats

If you create the function (code below), and execute this:
select * from getWords( 'this is only a one one figurative test test ', 0, 1 );

You will get this:


The function's code:

create FUNCTION [dbo].[getWords]
(
       @text nvarchar(4000)
       , @ReturnOnlyMeaningful bit
       , @SuppressDuplicates bit
)
RETURNS
       @words
       TABLE (
              WordID smallint --primary key
              --, Position smallint
              , Word varchar(255)
              , OrderOfAppearance tinyint
              , AlphabeticRank tinyint
              , [Length] tinyint
                     , LengthRank tinyint
              , Frequency tinyint
              , WordCount tinyint
                     , DistinctWordCount tinyint
                    
              , Meaningful tinyint
              , MeaningfulWordCount tinyint
                     , MeaningfulDistinctWordCount tinyint
             
              , OriginalText varchar(4000)
       )
AS
BEGIN

       -- Parameter validation and cleanup
      
              -- Do not process empty strings
              if ltrim(rtrim(@text)) = ''
              begin
                     return
              end

              -- Double quote characters if found
              /*
                     select REPLACE( 'this is" something', '"', '""' );
                     select REPLACE( 'this is'' something', '''', '''''' );
              */
              set @text = REPLACE( @text, '"', '""' );
              set @text = REPLACE( @text, '''', '''''' );
             



       -- If it is ok to display dupicates
       if @SuppressDuplicates = 0
       begin
              insert into
                     @words
              select
                     occurrence
                     , display_term
                     , dense_rank()
                           over(
                                  order by occurrence asc
                           )
                           as OrderOfAppearance
                     , DENSE_RANK()
                           over(
                                  order by display_term asc
                           ) as AlphabeticRank
                     , LEN( display_term ) as [Length]
                     , dense_rank()
                           over(
                                  /*
                                  partition by 
                                         case  
                                                when special_term = 'Exact Match'
                                                then 1
                                                else 0
                                         end
                                  */                        
                                  order by LEN( display_term ) desc
                           ) as isMeaningfulLengthRank                           
                     , CAST( 1 as tinyint ) as Frequency     
                     , null as WordCount
                     , null as DistinctWordCount
                     , cast(
                           case  
                                  when special_term = 'Exact Match'
                                  then 1
                                  else 0
                           end
                           as tinyint
                     ) as isMeaningful         
                     , null as MeaningfulWordCount
                     , null as MeaningfulDistinctWordCount
                     , @text as OriginalText   
              from
                     sys.dm_fts_parser ( N'"' + @text + '"', 1033, 0, 0)
              where
                     -- Return only meaningful, according to the parameter
                     (
                           @ReturnOnlyMeaningful = 0
                           or (
                                  @ReturnOnlyMeaningful = 1
                                  and special_term = 'Exact Match'
                           )
                     )
                     -- Do not return end of file word.
                     and display_term not in
                     (
                           'END OF FILE' -- Seems to be produced by hyphens
                     )
              ;
       end
      
      
      
       -- If we are to suppress duplicates
       else
       begin
              insert into
                     @words
              select
                     min( occurrence ) as occurrenceMin
                     , display_term
                     , dense_rank()
                           over(
                                  order by min( occurrence ) asc
                           )
                           as OrderOfAppearance             
                     , DENSE_RANK()
                           over(
                                  order by display_term asc
                           ) as AlphabeticRank              
                     , LEN( display_term ) as [Length]
                     , dense_rank()
                           over(
                                  /*
                                  partition by 
                                         case  
                                                when special_term = 'Exact Match'
                                                then 1
                                                else 0
                                         end                       
                                  */
                                  order by LEN( display_term ) desc
                           ) as isMeaningfulLengthRank                                         
                     , COUNT(*) as Frequency   
                     , null as WordCount 
                     , null as DistinctWordCount
                     , cast(
                           case  
                                  when special_term = 'Exact Match'
                                  then 1
                                  else 0
                           end
                           as tinyint
                     ) as isMeaningful                
                     , null as MeaningfulWordCount
                     , null as MeaningfulDistinctWordCount                 
                     , @text as OriginalText   
              from
                     sys.dm_fts_parser ( N'"' + @text + '"', 1033, 0, 0 )
              where
                     -- Return only meaningful, according to the parameter
                     (
                           @ReturnOnlyMeaningful = 0
                           or (
                                  @ReturnOnlyMeaningful = 1
                                  and special_term = 'Exact Match'
                           )
                     )
                     -- Do not return end of file word.
                     and display_term not in
                     (
                           'END OF FILE'
                     )
              group by
                     display_term
                     , case
                           when special_term = 'Exact Match'
                           then 1
                           else 0
                     end
              ;     
       end
      
      
       -- Word count
       update
              @words
       set
              WordCount = ( select SUM(Frequency) from @words )
              , DistinctWordCount = ( select COUNT(*) from @words )
              , MeaningfulWordCount = ( select SUM( Meaningful * Frequency ) from @words )
              , MeaningfulDistinctWordCount = ( select SUM( Meaningful ) from @words )
       ;
      
      
       -- Return
       return;
      
end



No comments:

Post a Comment