Extract an image from a specific page of a PDF

I want to extract an image from a PDF file. I tried the following code, and it extracted the JPEG image from the PDF perfectly. The problem is how to extract an image from a specific page, for example page 1 or some other page. I do not want to scan the entire PDF file when searching for images.

Any suggestions?

Image Retrieval Code:

private void List<System.Drawing.Image> ExtractImages(String PDFSourcePath) { List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>(); iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = null; iTextSharp.text.pdf.PdfReader PDFReaderObj = null; iTextSharp.text.pdf.PdfObject PDFObj = null; iTextSharp.text.pdf.PdfStream PDFStremObj = null; try { RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(PDFSourcePath); PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null); for (int i = 0; i <= PDFReaderObj.XrefSize - 1; i++) { PDFObj = PDFReaderObj.GetPdfObject(i); if ((PDFObj != null) && PDFObj.IsStream()) { PDFStremObj = (iTextSharp.text.pdf.PdfStream)PDFObj; iTextSharp.text.pdf.PdfObject subtype = PDFStremObj.Get(iTextSharp.text.pdf.PdfName.SUBTYPE); if ((subtype != null) && subtype.ToString() == iTextSharp.text.pdf.PdfName.IMAGE.ToString()) { byte[] bytes = iTextSharp.text.pdf.PdfReader.GetStreamBytesRaw((iTextSharp.text.pdf.PRStream)PDFStremObj); if ((bytes != null)) { try { System.IO.MemoryStream MS = new System.IO.MemoryStream(bytes); MS.Position = 0; System.Drawing.Image ImgPDF = System.Drawing.Image.FromStream(MS); pictureBox1.Image = ImgPDF; MS.Close(); MS.Flush(); } catch (Exception) { } } } } } PDFReaderObj.Close(); } catch (Exception ex) { throw new Exception(ex.Message); } } 
+4
source share
3 answers

I don't have iTextSharp 4.0 available at the moment, so this code targets 5.2, but it should work just fine for older versions too. This code is almost a direct lift from this post here, so please see that post, as well as the answers to the follow-up questions. As I said in the comments above, your code looks at all the images from the document's point of view, whereas the code that I linked walks through the pages.

Please read all the comments in the other post, especially the one explaining that this ONLY works for JPG images. PDF supports many different types of images, so unless you know that you are only dealing with JPGs, you will need to add a good deal more code. See this post and this post for some tips.

  string testFile = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Doc1.pdf"); string outputPath = Environment.GetFolderPath(Environment.SpecialFolder.Desktop); int pageNum = 1; PdfReader pdf = new PdfReader(testFile); PdfDictionary pg = pdf.GetPageN(pageNum); PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)); PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)); if (xobj == null) { return; } foreach (PdfName name in xobj.Keys) { PdfObject obj = xobj.Get(name); if (!obj.IsIndirect()) { continue; } PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj); PdfName type = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)); if (!type.Equals(PdfName.IMAGE)) { continue; } int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture)); PdfObject pdfObj = pdf.GetPdfObject(XrefIndex); PdfStream pdfStrem = (PdfStream)pdfObj; byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem); if (bytes == null) { continue; } using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes)) { memStream.Position = 0; System.Drawing.Image img = System.Drawing.Image.FromStream(memStream); if (!Directory.Exists(outputPath)) Directory.CreateDirectory(outputPath); string path = Path.Combine(outputPath, String.Format(@"{0}.jpg", pageNum)); System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1); parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0); var jpegEncoder = ImageCodecInfo.GetImageEncoders().ToList().Find(x => x.FormatID == ImageFormat.Jpeg.Guid); img.Save(path, jpegEncoder, parms); } } 
+8
source

Below is the code that I use to extract images from a PDF. This works great for me.

 // Required: iTextSharp.dll using System.Drawing; using System.Drawing.Imaging; using System.IO; using iTextSharp.text.pdf.parser; using Dotnet = System.Drawing.Image; using iTextSharp.text.pdf; namespace PDF_Parsing { partial class ExtractPdfImage { string imgPath = @"c:\extractedImg.png"; private void ExtractImage(string pdfFile) { const int pageNumber = 1; PdfReader pdf = new PdfReader(pdfFile); PdfDictionary pg = pdf.GetPageN(pageNumber); PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)); PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)); foreach (PdfName name in xobj.Keys) { PdfObject obj = xobj.Get(name); if (obj.IsIndirect()) { PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj); string width = tg.Get(PdfName.WIDTH).ToString(); string height = tg.Get(PdfName.HEIGHT).ToString(); ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(float.Parse(width), float.Parse(height)), (PRIndirectReference)obj, tg); RenderImage(imgRI); } } } private void RenderImage(ImageRenderInfo renderInfo) { PdfImageObject image = renderInfo.GetImage(); using (Dotnet dotnetImg = image.GetDrawingImage()) { if (dotnetImg != null) { using (MemoryStream ms = new MemoryStream()) { dotnetImg.Save(ms, ImageFormat.Tiff); Bitmap d = new Bitmap(dotnetImg); d.Save(imgPath); } } } } } } 
+2
source

The following code works great to extract an image from a specific page.

 using System.Drawing; using System.Drawing.Imaging; using System.IO; using iTextSharp.text.pdf.parser; using Dotnet = System.Drawing.Image; using iTextSharp.text.pdf; namespace PDF_Parsing { partial class PDF_ImgExtraction { string imgPath; private void ExtractImage(string pdfFile) { const int pageNumber = 1;//Page number to extract the image from PdfReader pdf = new PdfReader(pdfFile); PdfDictionary pg = pdf.GetPageN(pageNumber); PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)); PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)); foreach (PdfName name in xobj.Keys) { PdfObject obj = xobj.Get(name); if (obj.IsIndirect()) { PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj); string width = tg.Get(PdfName.WIDTH).ToString(); string height = tg.Get(PdfName.HEIGHT).ToString(); ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new Matrix(float.Parse(width), float.Parse(height)), (PRIndirectReference)obj, tg); RenderImage(imgRI); } } } private void RenderImage(ImageRenderInfo renderInfo) { PdfImageObject image = renderInfo.GetImage(); using (Dotnet dotnetImg = image.GetDrawingImage()) { if (dotnetImg != null) { using (MemoryStream ms = new MemoryStream()) { dotnetImg.Save(ms, ImageFormat.Tiff); Bitmap d = new Bitmap(dotnetImg); d.Save(imgPath); } } } } } } 
0
source

Source: https://habr.com/ru/post/1413575/


All Articles