[swift-users] Zero cost abstraction 2D Iterator (equivalent to two nested for loops) impossible?

Jens Persson jens at bitcycle.com
Wed Jan 4 14:10:10 CST 2017


I noticed disabling safety checks made the custom Iterator as fast as the
two nested for-loops.
Karl, how does your test change when disabling safety checks?

Anyone having an idea why disabling safety checks should make the two sum
funcs equally fast in my example program?

(It shouldn't be the preconditions of Table2D<>'s subscripts, unless the
optimizer (with safety checks) can remove them in the case of the nested
for-loops but not in the case of the custom iterator.)

/Jens


On Wed, Jan 4, 2017 at 6:59 PM, Jens Persson <jens at bitcycle.com> wrote:

> Thanks, I wonder if it is currently impossible to make it as fast as the
> nested for loops, ie that some optimizer improvement could fix it.
> Here's a stripped down version of my code:
>
>
> import QuartzCore // This is just for timing using CACurrentMediaTime()
>
>
> struct Point2DInt {
>     var x: Int
>     var y: Int
> }
>
> struct Size2DInt {
>     var width: Int
>     var height: Int
>     var rect: Rect2DInt { return Rect2DInt(x: 0, y: 0, width: width,
> height: height) }
> }
>
> struct Rect2DInt : Sequence {
>     var description: String { return "(\(origin), \(size))" }
>     var origin: Point2DInt
>     var size: Size2DInt
>     var minX: Int {
>         get { return origin.x }
>         set { size.width = maxX - newValue; origin.x = newValue }
>     }
>     var maxX: Int {
>         get { return origin.x + size.width }
>         set { size.width = newValue - minX }
>     }
>     var minY: Int {
>         get { return origin.y }
>         set { size.height = maxY - newValue; origin.y = newValue }
>     }
>     var maxY: Int {
>         get { return origin.y + size.height }
>         set { size.height = newValue - minY }
>     }
>     init(origin: Point2DInt, size: Size2DInt) {
>         self.origin = origin
>         self.size = size
>     }
>     init(x: Int, y: Int, width: Int, height: Int) {
>         self.init(origin: Point2DInt(x: x, y: y), size: Size2DInt(width:
> width, height: height))
>     }
>     init(minX: Int, minY: Int, maxX: Int, maxY: Int) {
>         self.init(origin: Point2DInt(x: minX, y: minY),
>                   size: Size2DInt(width: maxX - minX, height: maxY - minY))
>     }
>     func makeIterator() -> Rect2DIntPointIterator {
>         return Rect2DIntPointIterator(rect: self)
>     }
> }
>
>
> // This is the crucial type here, this version is the fastest that
> // I could find, but it's still slower than two nested for loops,
> // see test below.
> struct Rect2DIntPointIterator : IteratorProtocol, Sequence {
>     let startX, startY, stopX, stopY: Int
>     var currentPoint: Point2DInt
>     init(rect: Rect2DInt) {
>         currentPoint = rect.origin
>         startX = rect.origin.x
>         startY = rect.origin.y
>         stopX = rect.maxX
>         stopY = rect.maxY
>     }
>     mutating func next() -> Point2DInt? {
>         defer { currentPoint.x = currentPoint.x &+ 1 }
>         if currentPoint.x == stopX {
>             currentPoint.x = startX
>             currentPoint.y = currentPoint.y &+ 1
>             if currentPoint.y == stopY { return nil }
>         }
>         return currentPoint
>     }
> }
>
>
> struct Table2D<Element> {
>     let size: Size2DInt
>     var storage: [Element]
>     init(size: Size2DInt, filledWith element: Element) {
>         precondition(size.width > 0 && size.height > 0)
>         self.size = size
>         self.storage = [Element](repeating: element, count: size.width *
> size.height)
>     }
>     subscript(x: Int, y: Int) -> Element {
>         get {
>             precondition(x >= 0 && y >= 0 && x < size.width && y <
> size.height)
>             return storage[x + y * size.width]
>         }
>         set {
>             precondition(x >= 0 && y >= 0 && x < size.width && y <
> size.height)
>             storage[x + y * size.width] = newValue
>         }
>     }
>     subscript(position: Point2DInt) -> Element {
>         get { return self[position.x, position.y] }
>         set { self[position.x, position.y] = newValue }
>     }
> }
>
> func randomDouble() -> Double {
>     // Returns a random Double in the range [0, 1)
>     let ui64 = (UInt64(arc4random()) << 32) | UInt64(arc4random())
>     return Double(ui64 >> UInt64(63 - Double.significandBitCount)) *
> .ulpOfOne/2
> }
> func randomInt(from: Int, to: Int) -> Int {
>     // Returns an Int in the range [from, to)
>     return Int(Double(from) + (randomDouble() * Double(to -
> from)).rounded(.down))
> }
>
> func randomSubrects(of rect: Rect2DInt, minSize: Size2DInt, count: Int) ->
> [Rect2DInt] {
>     precondition(count > 0 && minSize.width > 0 && minSize.height > 0)
>     var subrects = [Rect2DInt]()
>     subrects.reserveCapacity(count)
>     for _ in 0 ..< count {
>         let size = Size2DInt(width: randomInt(from: minSize.width, to:
> rect.size.width),
>                              height: randomInt(from: minSize.height, to:
> rect.size.height))
>         let origin = Point2DInt(x: randomInt(from: 0, to: rect.size.width
> - size.width),
>                                 y: randomInt(from: 0, to: rect.size.height
> - size.height))
>         subrects.append(Rect2DInt(origin: origin, size: size))
>     }
>     return subrects
> }
>
>
> func randomTable(size: Size2DInt) -> Table2D<Double> {
>     var table = Table2D(size: size, filledWith: 0.0)
>     for p in table.size.rect { table[p] = randomDouble() }
>     return table
> }
>
>
> func sum1(areas: [Rect2DInt], of table: Table2D<Double>) -> Double {
>     var sum = 0.0
>     for r in areas {
>         // Using custom iterator:
>         for p in r { sum += table[p] }
>     }
>     return sum
> }
>
> func sum2(areas: [Rect2DInt], of table: Table2D<Double>) -> Double {
>     var sum = 0.0
>     for r in areas {
>         // Using two nested for loops:
>         for y in r.minY ..< r.maxY {
>             for x in r.minX ..< r.maxX {
>                 sum = sum + table[x, y]
>             }
>         }
>     }
>     return sum
> }
>
> func test(
>     sumFn: ([Rect2DInt], Table2D<Double>) -> Double,
>     label: String,
>     table: Table2D<Double>,
>     areas: [Rect2DInt]
>     )
> {
>     let t0 = CACurrentMediaTime()
>     let sum = sumFn(areas, table)
>     let t1 = CACurrentMediaTime()
>     print(label, t1 - t0, "seconds (sum \(sum))")
> }
>
> for _ in 0 ..< 4 {
>     let rndTable = randomTable(size: Size2DInt(width: 1000, height: 1000))
>     let rndTableAreas = randomSubrects(of: rndTable.size.rect,
>                                        minSize: Size2DInt(width: 100,
> height: 100),
>                                        count: 1000)
>     test(sumFn: sum1, label: "sum1 - using custom iterator  ", table:
> rndTable, areas: rndTableAreas)
>     test(sumFn: sum2, label: "sum2 - using nested for-loops ", table:
> rndTable, areas: rndTableAreas)
>     print()
> }
>
> //
> // Typical output on my MacBook Pro (Retina, 15-inch, Late 2013):
> //
> // sum1 - using custom iterator   0.480134483019356 seconds (sum
> 153408603.850653)
> // sum2 - using nested for-loops  0.348341046017595 seconds (sum
> 153408603.850653)
> //
> // sum1 - using custom iterator   0.426998238021042 seconds (sum
> 149851816.622638)
> // sum2 - using nested for-loops  0.34111139801098 seconds (sum
> 149851816.622638)
> //
> // sum1 - using custom iterator   0.466021075990284 seconds (sum
> 155267702.297466)
> // sum2 - using nested for-loops  0.351970263000112 seconds (sum
> 155267702.297466)
> //
> // sum1 - using custom iterator   0.426723245007452 seconds (sum
> 146331850.202214)
> // sum2 - using nested for-loops  0.340267747989856 seconds (sum
> 146331850.202214)
> //
>
>
> On Wed, Jan 4, 2017 at 12:42 PM, Karl <razielim at gmail.com> wrote:
>
>> Hmmm that’s interesting. A brief test I ran:
>>
>> import CoreGraphics
>> import Foundation
>>
>> struct PointIterator {
>>   let rect: CGRect
>>   var nextPoint: CGPoint
>>
>>   let maxX: CGFloat
>>   let maxY: CGFloat
>>
>>   init(rect: CGRect) {
>>     self.rect      = rect.standardized
>>     self.nextPoint = self.rect.origin
>>     // Cache for fast iteration
>>     self.maxX      = self.rect.maxX
>>     self.maxY      = self.rect.maxY
>>   }
>>
>>   mutating func next() -> CGPoint? {
>>     guard nextPoint.x <= maxX, nextPoint.y <= maxY else {
>>       return .none
>>     }
>>     defer {
>>       nextPoint.x += 1
>>       if nextPoint.x > maxX {
>>         nextPoint.x  = rect.origin.x
>>         nextPoint.y += 1
>>       }
>>     }
>>     return nextPoint
>>   }
>> }
>>
>> // Use iterator
>> func iteratePoints_it(_ rect: CGRect, with: (CGPoint)->()) {
>>   var it = PointIterator(rect: rect)
>>   while let point = it.next() {
>>     with(point)
>>   }
>> }
>>
>> // Basic unwrapping of the iterator as a function (no ‘defer’)
>> func iteratePoints_fe(_ rect: CGRect, with: (CGPoint)->()) {
>>   let rect      = rect.standardized
>>   var nextPoint = rect.origin
>>   let maxX      = rect.maxX
>>   let maxY      = rect.maxY
>>
>>   while true {
>>     guard nextPoint.x <= maxX, nextPoint.y <= maxY else {
>>       return
>>     }
>>     with(nextPoint)
>>     nextPoint.x += 1
>>     if nextPoint.x > maxX {
>>       nextPoint.x  = rect.origin.x
>>       nextPoint.y += 1
>>     }
>>   }
>> }
>>
>> // for..in loop
>> func iteratePoints_fe2(_ rect: CGRect, with: (CGPoint)->()) {
>>   let rect = rect.standardized
>>   let maxX = rect.maxX
>>   let maxY = rect.maxY
>>   for y in stride(from: rect.origin.y, to: maxY, by: 1) {
>>     for x in stride(from: rect.origin.x, to: maxX, by: 1) {
>>       with(CGPoint(x: x, y: y))
>>     }
>>   }
>> }
>>
>> func profile(_ iterations: Int, _ thing: ()->()) -> TimeInterval {
>>   var totalTime: TimeInterval = 0
>>   for _ in 0..<iterations {
>>     let start = Date().timeIntervalSinceReferenceDate
>>     thing()
>>     totalTime += (Date().timeIntervalSinceReferenceDate - start)
>>   }
>>   return totalTime/TimeInterval(iterations)
>> }
>>
>>
>> let bigRect = CGRect(x: 0, y: 0, width: 10_000, height: 10_000)
>>
>> let iterator = profile(10) { iteratePoints_it(bigRect)  { if $0.x >
>> 1_000_000 { print("?") } } } // always false, won't be optimised out.
>> let foreach = profile(10)  { iteratePoints_fe(bigRect)  { if $0.x >
>> 1_000_000 { print("?") } } }
>> let foreach2 = profile(10) { iteratePoints_fe2(bigRect) { if $0.x >
>> 1_000_000 { print("?") } } }
>> print("iterator: \(iterator) \nforeach: \(foreach) \nforeach2:
>> \(foreach2)")
>>
>>
>> Results:
>>
>> iterator: 0.316907703876495
>> foreach: 0.283202117681503
>> foreach2: 0.568318998813629
>>
>> That ranking is consistent, too. Using an iterator does appear marginally
>> slower than a basic unwrapping of the iterator in to a function.
>>
>> On 4 Jan 2017, at 09:56, Jens Persson via swift-users <
>> swift-users at swift.org> wrote:
>>
>> Hi,
>>
>> I'm working on some low-level pixel processing code (stuff that is not
>> possible to do using standard API or on eg GPU), and I had lots of eg these:
>>
>> for y in someStartY ..< someStopY {
>>     for x in someStartX ..< someStopX {
>>         ... pixels[x, y] ...
>>     }
>> }
>>
>> So I implemented some (value) types like eg IntPoint2D, IntSize2D,
>> IntRect2D and I made an IntRect2DIterator so that IntRect2D could be a
>> Sequence over its (discrete) points. With this I could rewrite the above
>> like so:
>>
>> for pixelPosAsIntPoint2D in someIntRect2D {
>>     ... pixels[pixelPosAsIntPoint2D] ...
>> }
>>
>> For some reason the first version (two nested for loops for x and y) is
>> always a bit faster than the abstracted version no matter how I write it
>> (tried using eg &+ instead of + etc).
>>
>>
>>
>> Is it possible to write as a zero cost abstraction like this, if so, how?
>> If not, why?
>>
>>
>>
>>
>> PS
>>
>> Note that eg this:
>>
>> for y in someStartY ..< someStopY {
>>     for x in someStartX ..< someStopX {
>>        let pixelPosAsIntPoint2D = IntPoint2D(x: x, y: y)
>>         ... pixels[pixelPosAsIntPoint2D] ...
>>     }
>> }
>>
>> is exactly as fast as the top example (using just ... pixels[x, y] ...).
>> So the difference in execution time seems to be due to something in the
>> Iterator and not eg the pixel accessing subscript taking the 2d int point
>> type instead of separate x and y ints.
>>
>> Here is one Iterator variant that I have tested:
>>
>> struct Rect2DIntPointIterator : IteratorProtocol, Sequence {
>>     let startX, startY, stopX, stopY: Int
>>     var px, py: Int
>>     init(rect: Rect2DInt) {
>>         startX = rect.origin.x
>>         startY = rect.origin.y
>>         stopX = rect.maxX
>>         stopY = rect.maxY
>>         px = startX
>>         py = startY
>>     }
>>     mutating func next() -> Point2DInt? {
>>         defer { px = px &+ 1 }
>>         if px == stopX {
>>             px = startX
>>             py = py &+ 1
>>             if py == stopY { return nil }
>>         }
>>         return Point2DInt(x: px, y: py)
>>     }
>> }
>>
>>
>> And here are typical execution times from an example test:
>> 2.1 seconds using my Iterator (the fastest I can get it, no matter how I
>> try to rewrite it).
>> 1.5 seconds using nested x, y for loops.
>>
>> I'm pretty sure my testing is done thoroughly (measuring average of many
>> runs, using random data, avoiding dead code elimination, whole module
>> optimization, etc).
>>
>> I have tried profiling the code and looking at the disassmbly but I'm
>> failing to understand what's going on.
>>
>> So the ultimate answer would be in the form of a (2d, Int) Rectangle type
>> whose (2d, Int) Points can be iterated in a for loop, at zero cost compared
>> to doing the same using two nested for loops. Or an explanation of why this
>> is impossible.
>>
>> DS
>>
>> /Jens
>>
>> _______________________________________________
>> swift-users mailing list
>> swift-users at swift.org
>> https://lists.swift.org/mailman/listinfo/swift-users
>>
>>
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.swift.org/pipermail/swift-users/attachments/20170104/3170b0ab/attachment.html>


More information about the swift-users mailing list