[swift-users] Zero cost abstraction 2D Iterator (equivalent to two nested for loops) impossible?

Wed Jan 4 11:59:25 CST 2017

Thanks, I wonder if it is currently impossible to make it as fast as the
nested for loops, ie that some optimizer improvement could fix it.
Here's a stripped down version of my code:

import QuartzCore // This is just for timing using CACurrentMediaTime()

struct Point2DInt {
    var x: Int
    var y: Int
}

struct Size2DInt {
    var width: Int
    var height: Int
    var rect: Rect2DInt { return Rect2DInt(x: 0, y: 0, width: width,
height: height) }
}

struct Rect2DInt : Sequence {
    var description: String { return "(\(origin), \(size))" }
    var origin: Point2DInt
    var size: Size2DInt
    var minX: Int {
        get { return origin.x }
        set { size.width = maxX - newValue; origin.x = newValue }
    }
    var maxX: Int {
        get { return origin.x + size.width }
        set { size.width = newValue - minX }
    }
    var minY: Int {
        get { return origin.y }
        set { size.height = maxY - newValue; origin.y = newValue }
    }
    var maxY: Int {
        get { return origin.y + size.height }
        set { size.height = newValue - minY }
    }
    init(origin: Point2DInt, size: Size2DInt) {
        self.origin = origin
        self.size = size
    }
    init(x: Int, y: Int, width: Int, height: Int) {
        self.init(origin: Point2DInt(x: x, y: y), size: Size2DInt(width:
width, height: height))
    }
    init(minX: Int, minY: Int, maxX: Int, maxY: Int) {
        self.init(origin: Point2DInt(x: minX, y: minY),
                  size: Size2DInt(width: maxX - minX, height: maxY - minY))
    }
    func makeIterator() -> Rect2DIntPointIterator {
        return Rect2DIntPointIterator(rect: self)
    }
}

// This is the crucial type here, this version is the fastest that
// I could find, but it's still slower than two nested for loops,
// see test below.
struct Rect2DIntPointIterator : IteratorProtocol, Sequence {
    let startX, startY, stopX, stopY: Int
    var currentPoint: Point2DInt
    init(rect: Rect2DInt) {
        currentPoint = rect.origin
        startX = rect.origin.x
        startY = rect.origin.y
        stopX = rect.maxX
        stopY = rect.maxY
    }
    mutating func next() -> Point2DInt? {
        defer { currentPoint.x = currentPoint.x &+ 1 }
        if currentPoint.x == stopX {
            currentPoint.x = startX
            currentPoint.y = currentPoint.y &+ 1
            if currentPoint.y == stopY { return nil }
        }
        return currentPoint
    }
}

struct Table2D<Element> {
    let size: Size2DInt
    var storage: [Element]
    init(size: Size2DInt, filledWith element: Element) {
        precondition(size.width > 0 && size.height > 0)
        self.size = size
        self.storage = [Element](repeating: element, count: size.width *
size.height)
    }
    subscript(x: Int, y: Int) -> Element {
        get {
            precondition(x >= 0 && y >= 0 && x < size.width && y <
size.height)
            return storage[x + y * size.width]
        }
        set {
            precondition(x >= 0 && y >= 0 && x < size.width && y <
size.height)
            storage[x + y * size.width] = newValue
        }
    }
    subscript(position: Point2DInt) -> Element {
        get { return self[position.x, position.y] }
        set { self[position.x, position.y] = newValue }
    }
}

func randomDouble() -> Double {
    // Returns a random Double in the range [0, 1)
    let ui64 = (UInt64(arc4random()) << 32) | UInt64(arc4random())
    return Double(ui64 >> UInt64(63 - Double.significandBitCount)) *
.ulpOfOne/2
}
func randomInt(from: Int, to: Int) -> Int {
    // Returns an Int in the range [from, to)
    return Int(Double(from) + (randomDouble() * Double(to -
from)).rounded(.down))
}

func randomSubrects(of rect: Rect2DInt, minSize: Size2DInt, count: Int) ->
[Rect2DInt] {
    precondition(count > 0 && minSize.width > 0 && minSize.height > 0)
    var subrects = [Rect2DInt]()
    subrects.reserveCapacity(count)
    for _ in 0 ..< count {
        let size = Size2DInt(width: randomInt(from: minSize.width, to:
rect.size.width),
                             height: randomInt(from: minSize.height, to:
rect.size.height))
        let origin = Point2DInt(x: randomInt(from: 0, to: rect.size.width -
size.width),
                                y: randomInt(from: 0, to: rect.size.height
- size.height))
        subrects.append(Rect2DInt(origin: origin, size: size))
    }
    return subrects
}

func randomTable(size: Size2DInt) -> Table2D<Double> {
    var table = Table2D(size: size, filledWith: 0.0)
    for p in table.size.rect { table[p] = randomDouble() }
    return table
}

func sum1(areas: [Rect2DInt], of table: Table2D<Double>) -> Double {
    var sum = 0.0
    for r in areas {
        // Using custom iterator:
        for p in r { sum += table[p] }
    }
    return sum
}

func sum2(areas: [Rect2DInt], of table: Table2D<Double>) -> Double {
    var sum = 0.0
    for r in areas {
        // Using two nested for loops:
        for y in r.minY ..< r.maxY {
            for x in r.minX ..< r.maxX {
                sum = sum + table[x, y]
            }
        }
    }
    return sum
}

func test(
    sumFn: ([Rect2DInt], Table2D<Double>) -> Double,
    label: String,
    table: Table2D<Double>,
    areas: [Rect2DInt]
    )
{
    let t0 = CACurrentMediaTime()
    let sum = sumFn(areas, table)
    let t1 = CACurrentMediaTime()
    print(label, t1 - t0, "seconds (sum \(sum))")
}

for _ in 0 ..< 4 {
    let rndTable = randomTable(size: Size2DInt(width: 1000, height: 1000))
    let rndTableAreas = randomSubrects(of: rndTable.size.rect,
                                       minSize: Size2DInt(width: 100,
height: 100),
                                       count: 1000)
    test(sumFn: sum1, label: "sum1 - using custom iterator  ", table:
rndTable, areas: rndTableAreas)
    test(sumFn: sum2, label: "sum2 - using nested for-loops ", table:
rndTable, areas: rndTableAreas)
    print()
}

//
// Typical output on my MacBook Pro (Retina, 15-inch, Late 2013):
//
// sum1 - using custom iterator   0.480134483019356 seconds (sum
153408603.850653)
// sum2 - using nested for-loops  0.348341046017595 seconds (sum
153408603.850653)
//
// sum1 - using custom iterator   0.426998238021042 seconds (sum
149851816.622638)
// sum2 - using nested for-loops  0.34111139801098 seconds (sum
149851816.622638)
//
// sum1 - using custom iterator   0.466021075990284 seconds (sum
155267702.297466)
// sum2 - using nested for-loops  0.351970263000112 seconds (sum
155267702.297466)
//
// sum1 - using custom iterator   0.426723245007452 seconds (sum
146331850.202214)
// sum2 - using nested for-loops  0.340267747989856 seconds (sum
146331850.202214)
//

On Wed, Jan 4, 2017 at 12:42 PM, Karl <razielim at gmail.com> wrote:

> Hmmm that’s interesting. A brief test I ran:
>
> import CoreGraphics
> import Foundation
>
> struct PointIterator {
>   let rect: CGRect
>   var nextPoint: CGPoint
>
>   let maxX: CGFloat
>   let maxY: CGFloat
>
>   init(rect: CGRect) {
>     self.rect      = rect.standardized
>     self.nextPoint = self.rect.origin
>     // Cache for fast iteration
>     self.maxX      = self.rect.maxX
>     self.maxY      = self.rect.maxY
>   }
>
>   mutating func next() -> CGPoint? {
>     guard nextPoint.x <= maxX, nextPoint.y <= maxY else {
>       return .none
>     }
>     defer {
>       nextPoint.x += 1
>       if nextPoint.x > maxX {
>         nextPoint.x  = rect.origin.x
>         nextPoint.y += 1
>       }
>     }
>     return nextPoint
>   }
> }
>
> // Use iterator
> func iteratePoints_it(_ rect: CGRect, with: (CGPoint)->()) {
>   var it = PointIterator(rect: rect)
>   while let point = it.next() {
>     with(point)
>   }
> }
>
> // Basic unwrapping of the iterator as a function (no ‘defer’)
> func iteratePoints_fe(_ rect: CGRect, with: (CGPoint)->()) {
>   let rect      = rect.standardized
>   var nextPoint = rect.origin
>   let maxX      = rect.maxX
>   let maxY      = rect.maxY
>
>   while true {
>     guard nextPoint.x <= maxX, nextPoint.y <= maxY else {
>       return
>     }
>     with(nextPoint)
>     nextPoint.x += 1
>     if nextPoint.x > maxX {
>       nextPoint.x  = rect.origin.x
>       nextPoint.y += 1
>     }
>   }
> }
>
> // for..in loop
> func iteratePoints_fe2(_ rect: CGRect, with: (CGPoint)->()) {
>   let rect = rect.standardized
>   let maxX = rect.maxX
>   let maxY = rect.maxY
>   for y in stride(from: rect.origin.y, to: maxY, by: 1) {
>     for x in stride(from: rect.origin.x, to: maxX, by: 1) {
>       with(CGPoint(x: x, y: y))
>     }
>   }
> }
>
> func profile(_ iterations: Int, _ thing: ()->()) -> TimeInterval {
>   var totalTime: TimeInterval = 0
>   for _ in 0..<iterations {
>     let start = Date().timeIntervalSinceReferenceDate
>     thing()
>     totalTime += (Date().timeIntervalSinceReferenceDate - start)
>   }
>   return totalTime/TimeInterval(iterations)
> }
>
>
> let bigRect = CGRect(x: 0, y: 0, width: 10_000, height: 10_000)
>
> let iterator = profile(10) { iteratePoints_it(bigRect)  { if $0.x >
> 1_000_000 { print("?") } } } // always false, won't be optimised out.
> let foreach = profile(10)  { iteratePoints_fe(bigRect)  { if $0.x >
> 1_000_000 { print("?") } } }
> let foreach2 = profile(10) { iteratePoints_fe2(bigRect) { if $0.x >
> 1_000_000 { print("?") } } }
> print("iterator: \(iterator) \nforeach: \(foreach) \nforeach2:
> \(foreach2)")
>
>
> Results:
>
> iterator: 0.316907703876495
> foreach: 0.283202117681503
> foreach2: 0.568318998813629
>
> That ranking is consistent, too. Using an iterator does appear marginally
> slower than a basic unwrapping of the iterator in to a function.
>
> On 4 Jan 2017, at 09:56, Jens Persson via swift-users <
> swift-users at swift.org> wrote:
>
> Hi,
>
> I'm working on some low-level pixel processing code (stuff that is not
> possible to do using standard API or on eg GPU), and I had lots of eg these:
>
> for y in someStartY ..< someStopY {
>     for x in someStartX ..< someStopX {
>         ... pixels[x, y] ...
>     }
> }
>
> So I implemented some (value) types like eg IntPoint2D, IntSize2D,
> IntRect2D and I made an IntRect2DIterator so that IntRect2D could be a
> Sequence over its (discrete) points. With this I could rewrite the above
> like so:
>
> for pixelPosAsIntPoint2D in someIntRect2D {
>     ... pixels[pixelPosAsIntPoint2D] ...
> }
>
> For some reason the first version (two nested for loops for x and y) is
> always a bit faster than the abstracted version no matter how I write it
> (tried using eg &+ instead of + etc).
>
>
>
> Is it possible to write as a zero cost abstraction like this, if so, how?
> If not, why?
>
>
>
>
> PS
>
> Note that eg this:
>
> for y in someStartY ..< someStopY {
>     for x in someStartX ..< someStopX {
>        let pixelPosAsIntPoint2D = IntPoint2D(x: x, y: y)
>         ... pixels[pixelPosAsIntPoint2D] ...
>     }
> }
>
> is exactly as fast as the top example (using just ... pixels[x, y] ...).
> So the difference in execution time seems to be due to something in the
> Iterator and not eg the pixel accessing subscript taking the 2d int point
> type instead of separate x and y ints.
>
> Here is one Iterator variant that I have tested:
>
> struct Rect2DIntPointIterator : IteratorProtocol, Sequence {
>     let startX, startY, stopX, stopY: Int
>     var px, py: Int
>     init(rect: Rect2DInt) {
>         startX = rect.origin.x
>         startY = rect.origin.y
>         stopX = rect.maxX
>         stopY = rect.maxY
>         px = startX
>         py = startY
>     }
>     mutating func next() -> Point2DInt? {
>         defer { px = px &+ 1 }
>         if px == stopX {
>             px = startX
>             py = py &+ 1
>             if py == stopY { return nil }
>         }
>         return Point2DInt(x: px, y: py)
>     }
> }
>
>
> And here are typical execution times from an example test:
> 2.1 seconds using my Iterator (the fastest I can get it, no matter how I
> try to rewrite it).
> 1.5 seconds using nested x, y for loops.
>
> I'm pretty sure my testing is done thoroughly (measuring average of many
> runs, using random data, avoiding dead code elimination, whole module
> optimization, etc).
>
> I have tried profiling the code and looking at the disassmbly but I'm
> failing to understand what's going on.
>
> So the ultimate answer would be in the form of a (2d, Int) Rectangle type
> whose (2d, Int) Points can be iterated in a for loop, at zero cost compared
> to doing the same using two nested for loops. Or an explanation of why this
> is impossible.
>
> DS
>
> /Jens
>
> _______________________________________________
> swift-users mailing list
> swift-users at swift.org
> https://lists.swift.org/mailman/listinfo/swift-users
>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.swift.org/pipermail/swift-users/attachments/20170104/9a45e2e8/attachment.html>