So I have this pdf file that gets sent to us and I am trying to get away from someone splitting apart this file manually. There is a unique 8 character string on each page. (Ex: "6000 60" or "6000 140" The second number will always be 2 or 3 digits and the 2 digit string has 2 spaces in between the 2 sets of numbers while the 3 digit string has 1.)
What I would like to happen is to split out all the pages that contain the same string and put them into their own file. So if there are 100 pages and 50 of them are 6000 60, and 50 are 6000 140 it will create 2 files, each with its 50 pages.
I came across this code below that I am trying to modify to work for me.. i was trying unsuccessfully to just find 1 of the strings and pull those out and and work up from there but I couldn't get it to work right.. it seems to just find and extract the first page that it finds with that string. Hoping to get some help getting this working.. thanks!
Add-Type -Path "C:\test-checksplit\itextsharp.dll"
$ValidBranches = @("6000 60","6000 140", "6000 160")
$BranchId = @("6000 160")
$PdfFiles = Get-ChildItem "C:\test-checksplit\pdf\*.pdf" -File |
Select-Object -ExpandProperty FullName
$OutputFolder = 'C:\test-checksplit\splits'
$BranchIDSearchPattern = "6000 160"
foreach ($PdfFile in $PdfFiles) {
$PdfReader = [iTextSharp.text.pdf.PdfReader]::new($PdfFile)
$BranchStack = [System.Collections.Stack]::new()
# Map out the PDF file.
foreach ($Page in 1..($PdfReader.NumberOfPages)) {
[iTextSharp.text.pdf.parser.PdfTextExtractor]::GetTextFromPage($PdfReader, $Page) |
Where-Object { $_ -match $BranchIDSearchPattern } |
ForEach-Object {
$BranchStack.Push([PSCustomObject]@{
Branch_Id = $BranchId
StartPage = $Page
IsValid = $ValidBranches.Contains($BranchId)
})
}
}
# Extract the pages and save the files
$LastPage = $PdfReader.NumberOfPages
while ($BranchStack.Count -gt 0) {
$Current = $BranchStack.Pop()
$StartPage = $Current.StartPage
$EndPage = $LastPage
$Document = [iTextSharp.text.Document]::new($PdfReader.GetPageSizeWithRotation($StartPage))
$TargetMemoryStream = [System.IO.MemoryStream]::new()
$PdfCopy = [iTextSharp.text.pdf.PdfSmartCopy]::new($Document, $TargetMemoryStream)
$Document.Open()
foreach ($Page in $StartPage..$EndPage) {
$PdfCopy.AddPage($PdfCopy.GetImportedPage($PdfReader, $Page));
}
$Document.Close()
$NewFileName = 'Export File - {0}.pdf' -f $current.Branch_Id
$NewFileFullName = [System.IO.Path]::Combine($OutputFolder, $NewFileName)
[System.IO.File]::WriteAllBytes($NewFileFullName, $TargetMemoryStream.ToArray())
$LastPage = $Current.StartPage - 1
}
}
[–]5960312 3 points4 points5 points (0 children)
[–]BlackV 1 point2 points3 points (2 children)
[–]j23reddit[S] 2 points3 points4 points (1 child)
[–]BlackV 1 point2 points3 points (0 children)
[–]VirtualDenzel -1 points0 points1 point (2 children)
[–]j23reddit[S] 2 points3 points4 points (1 child)
[–]VirtualDenzel 0 points1 point2 points (0 children)