' VARIABLE NAME PREFIXES:
' dt = Data Table
' str = String
' int = Integer
' sb = string builder
''' <summary>
''' Builds every contiguous n-gram (run of adjacent words) contained in a string of words.
''' </summary>
''' <param name="StringOfWords">
''' The text to split into words. Every "." and "?" is removed and the result is trimmed
''' before it is passed to Tokenizer. Nothing is treated as an empty string.
''' </param>
''' <returns>
''' A DataTable with one row per n-gram and these columns:
'''   NGram          - the words of the n-gram joined by single spaces
'''   OutputNbrWords - number of words in the n-gram
'''   StartWordNbr   - zero-based index of the n-gram's first word in the token array
'''   EndWordNbr     - zero-based index of the n-gram's last word in the token array
'''   Input          - the cleaned input string
'''   InputNbrWords  - total number of tokens in the cleaned input
''' Rows are ordered by start word ascending and, within one start word, from the longest
''' n-gram to the shortest. The table has no rows when there are no tokens.
''' </returns>
Public Function CreateNGramsFromStringOfWords(ByVal StringOfWords As String) As DataTable
    ' BUILD OUTPUT TABLE COLUMNS
    ' NGram  OutputNbrWords  StartWordNbr  EndWordNbr  Input  InputNbrWords
    Dim dtNGrams As New DataTable ' This is the return value for this Function
    dtNGrams.Columns.Add("NGram")
    dtNGrams.Columns.Add("OutputNbrWords", GetType(Integer))
    dtNGrams.Columns.Add("StartWordNbr", GetType(Integer))
    dtNGrams.Columns.Add("EndWordNbr", GetType(Integer))
    dtNGrams.Columns.Add("Input")
    dtNGrams.Columns.Add("InputNbrWords", GetType(Integer))

    ' ENSURE THAT THE INPUT IS CLEANED OF PUNCTUATION
    ' String.Replace is used instead of the VB runtime Replace() function: the runtime
    ' function returns Nothing for an empty expression, which made the following Trim
    ' throw for empty or punctuation-only input.
    If StringOfWords Is Nothing Then StringOfWords = String.Empty
    StringOfWords = StringOfWords.Replace(".", "").Replace("?", "").Trim()

    Dim strArrayOfWords() As String = Tokenizer(StringOfWords)
    ' Defensive: treat a missing token array as "no words" instead of failing.
    If strArrayOfWords Is Nothing Then strArrayOfWords = New String() {}
    Dim intInputNbrWords As Integer = strArrayOfWords.Length

    ' ** BEGIN LOOPS **
    ' Outer loop drops words from the front; inner loop drops words from the back.
    For intFirstWordPntr As Integer = 0 To intInputNbrWords - 1
        For intLastWordPntr As Integer = intInputNbrWords - 1 To intFirstWordPntr Step -1
            Dim intOutputNbrWords As Integer = intLastWordPntr - intFirstWordPntr + 1
            ' Trim keeps the result identical to the previous append-then-trim approach
            ' when a token at either edge is empty or whitespace.
            Dim strNGram As String = String.Join(" ", strArrayOfWords, intFirstWordPntr, intOutputNbrWords).Trim()

            dtNGrams.Rows.Add(strNGram, intOutputNbrWords, intFirstWordPntr, intLastWordPntr, StringOfWords, intInputNbrWords)
        Next
    Next

    Return dtNGrams
End Function
Zoe The Robot. Why generate NGrams? Zoe The Robot. Actual code used to generate NGrams |