Option Explicit
Dim LC As Long 'last column
Dim CX As Long 'active cell column
Dim rSW As Range
Sub Word_Phrase_Frequency_v1()
'The code will generate word/phrase frequency
'How to use:
'1. Add reference to "Microsoft VBScript Regular Expressions 5.5" (you need to do it once only):
' In Visual Basic Editor menu, select Tools –> References, then select Microsoft VBScript Regular Expressions 5.5, then click OK.
'2. Data must be in column A, start at A1
'3. Run Word_Phrase_Frequency_v1
'--- CHANGE sNumber & xPattern VALUE TO SUIT -----------------------------------
Const sNumber As String = "1,2,3" '"1,2,3"
'sNumber = "1" will generate 1 word frequency list
'sNumber = "1,2,3" will generate 1 word, 2 word & 3 word frequency list
Const xPattern As String = "A-Z0-9_'"
'define the word characters, the above pattern will include letter, number, underscore & apostrophe as word character
'word with apostrophe such as "you're" counts as one word.
'word with underscore such as "aa_bb" counts as one word.
'Const xCol As String = "C:ZZ" 'columns to clear
Dim i As Long, j As Long, k As Long, h As Long
Dim txa As String, txq As String
Dim z, t, va, ary, arz
Dim rngA As Range
Set rngA = Application.Selection
Set rngA = Application.InputBox("Put the cursor at the proper column", "", rngA.Address, Type:=8)
rngA.Activate
t = Timer
Application.ScreenUpdating = False
CX = ActiveCell.Column
LC = ActiveSheet.Cells.Find("*", SearchOrder:=xlByColumns, SearchDirection:=xlPrevious).Column
'if there are errors, remove them
On Error Resume Next
Columns(CX).SpecialCells(xlCellTypeFormulas, xlErrors).ClearContents
Columns(CX).SpecialCells(xlConstants, xlErrors).ClearContents
On Error GoTo 0
Cells(1, LC + 2) = Cells(1, CX)
Cells(2, LC + 2) = "Division"
Cells(2, LC + 3) = "WORDS"
Cells(2, LC + 4) = "COUNT"
j = Range("A" & Rows.Count).End(xlUp).Row
va = Range("A1:A" & j)
For i = 2 To UBound(va, 1) 'data start at row 2
k = i
Do
i = i + 1
If i > UBound(va, 1) Then Exit Do
Loop While va(i, 1) = va(i - 1, 1)
i = i - 1
' Debug.Print j & " : " & i
txq = txq & va(i, 1) & ":" & k & ":" & i & ","
Next
'Debug.Print txq
ary = Split(txq, ",")
For h = 0 To UBound(ary) - 1
arz = Split(ary(h), ":")
txa = Join(Application.Transpose(Range(Cells(arz(1), CX), Cells(arz(2), CX))), " ")
'if you have stop words list then put the list starting at A1 in sheet2
With Sheets("Sheet2")
Set rSW = .Range("A1", .Cells(.Rows.Count, "A").End(xlUp))
End With
If rSW.Cells(1) <> "" Then Call stopWord(xPattern, txa)
Debug.Print txa
Call toProcessY(1, txa, xPattern, CStr(arz(0)))
Next
'Range(xCol).Columns.AutoFit
Application.ScreenUpdating = True
Cells(1, LC + 2).Activate
Debug.Print "It's done in: " & Timer - t & " seconds"
End Sub
Sub toProcessY(n As Long, ByVal tx As String, xP As String, div As String)
'phrase frequency
Dim regEx As Object, matches As Object, x As Object, d As Object
Dim i As Long, LR As Long
Dim va, q
Set regEx = CreateObject("VBScript.RegExp")
With regEx
.Global = True
.MultiLine = True
.ignorecase = True
End With
Set d = CreateObject("scripting.dictionary")
d.CompareMode = vbTextCompare
' regEx.Pattern = Trim(WorksheetFunction.Rept("[A-Z0-9_']+ ", n)) 'match n words (the phrase) separated by a space
regEx.Pattern = Trim(WorksheetFunction.Rept("[" & xP & "]+ ", n)) 'match n words (the phrase) separated by a space
Set matches = regEx.Execute(tx)
For Each x In matches
d(CStr(x)) = d(CStr(x)) + 1 'get phrase frequency
Next
For i = 1 To n - 1
regEx.Pattern = "^[" & xP & "]+ "
If regEx.Test(tx) Then
tx = regEx.Replace(tx, "") 'remove first word in each line to get different combination of n words (phrase)
' regEx.Pattern = Trim(WorksheetFunction.Rept("[A-Z0-9_']+ ", n))
regEx.Pattern = Trim(WorksheetFunction.Rept("[" & xP & "]+ ", n))
Set matches = regEx.Execute(tx)
For Each x In matches
d(CStr(x)) = d(CStr(x)) + 1 'get phrase frequency
Next
End If
Next
'If d.Count = 0 Then MsgBox "Nothing with " & n & " word phrase found": Exit Sub
If d.Count = 0 Then Exit Sub
LR = Cells(Rows.Count, LC + 3).End(xlUp).Row + 1
'put the result
With Cells(LR, LC + 3).Resize(d.Count, 2)
.Value = Application.Transpose(Array(d.Keys, d.Items))
.Sort Key1:=.Cells(1, 2), Order1:=xlDescending, Key2:=.Cells(1, 1), Order2:=xlAscending, Header:=xlNo
.Cells(1, 1).Offset(, -1) = div
End With
End Sub
Sub stopWord(xP As String, tx As String)
Dim n As Long
Dim stW, x
Dim regEx As Object
stW = rSW.Value
Set regEx = CreateObject("VBScript.RegExp")
With regEx
.Global = True
.MultiLine = True
.ignorecase = True
End With
tx = " " & tx
For Each x In stW
regEx.Pattern = "[^" & xP & "]" & x & "[^" & xP & "]"
If regEx.Test(tx) Then
tx = regEx.Replace(tx, " ") 'replace stop word with " "
End If
Next
End Sub